Created and Edited by Ezz Maree & Saja Abed Elhadi.
Importing libraries and functions that we'll be using
import warnings
warnings.filterwarnings('ignore')
# data and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# pre-processing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn_pandas import DataFrameMapper #Normalization
# prediction
# model building
from sklearn.model_selection import GridSearchCV
# model evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import plot_confusion_matrix
# random forest classifier
from sklearn.ensemble import RandomForestClassifier
# gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
# KNN
from sklearn.neighbors import KNeighborsClassifier
# adaboost
from sklearn.ensemble import AdaBoostRegressor
# gradient boosting
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
#SVR
from sklearn.svm import SVR
# random forest regressor
from sklearn.ensemble import RandomForestRegressor
#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
# SVM
from sklearn import svm
# PCA
from sklearn.decomposition import PCA
#isomap
from sklearn.manifold import Isomap
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from yellowbrick.regressor import residuals_plot
from sklearn import linear_model
from yellowbrick.regressor import PredictionError
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
# set seed for reproducible results
RSEED = 10
In this section, we will explore and visualize the data and its variables to get a better understanding of what is required in the pre-processing section.
# Load the county land-area table, then preview five rows drawn with the
# notebook-wide seed so the sample is the same on every run.
us_landarea = pd.read_csv(filepath_or_buffer='data/CountyData/5296US_landarea.csv')
us_landarea.sample(n=5, random_state=RSEED)
| FIPS | State | County | Land Area | |
|---|---|---|---|---|
| 2081 | 39145 | Ohio | Scioto | 1585.9 |
| 485 | 13261 | Georgia | Sumter | 1257.0 |
| 1758 | 34037 | New Jersey | Sussex | 1350.0 |
| 1309 | 27061 | Minnesota | Itasca | 6903.1 |
| 1249 | 26107 | Michigan | Mecosta | 1439.5 |
# Dimensions of the land-area table as (rows, columns).
us_landarea.shape
(3111, 4)
# Column dtypes; checked here to see how the FIPS code column was parsed.
us_landarea.dtypes
FIPS int64 State object County object Land Area float64 dtype: object
# Number of missing values in each column of the land-area table.
us_landarea.isnull().sum()
FIPS 0 State 0 County 0 Land Area 0 dtype: int64
Since FIPS is stored as an int instead of a string (object), its leading zeros are dropped (e.g. 01001 is read as 1001), and because of that we might lose some data when we merge the dataframes.
# Cast the integer FIPS column to text and left-pad it with zeros to a width
# of 5, restoring the leading zeros that the integer representation dropped.
us_landarea['FIPS'] = us_landarea['FIPS'].astype(str).str.zfill(5)
# Ten counties with the largest land area.
us_landarea.nlargest(n=10, columns='Land Area')
| FIPS | State | County | Land Area | |
|---|---|---|---|---|
| 192 | 06071 | California | San Bernardino | 51960.0 |
| 69 | 04005 | Arizona | Coconino | 48223.6 |
| 1725 | 32023 | Nevada | Nye | 47001.4 |
| 1717 | 32007 | Nevada | Elko | 44500.2 |
| 75 | 04015 | Arizona | Mohave | 34479.2 |
| 67 | 04001 | Arizona | Apache | 29022.7 |
| 1722 | 32017 | Nevada | Lincoln | 27543.8 |
| 3106 | 56037 | Wyoming | Sweetwater | 27003.0 |
| 170 | 06027 | California | Inyo | 26397.5 |
| 2186 | 41025 | Oregon | Harney | 26249.4 |
We notice that the County of San Bernardino in the State of California has the largest Land Area.
# Count the counties listed for each state, then keep the five states with
# the highest counts.
s = (
    us_landarea.groupby('State')['County']
    .count()
    .reset_index(name='Num Of Counties')
)
s_top5 = s.nlargest(n=5, columns='Num Of Counties')

# Bar chart of those five states.
plt.figure(figsize=(8, 6))
plt.bar(
    x=s_top5['State'],
    height=s_top5['Num Of Counties'],
    color='#b3cde3',
    edgecolor='black',
)
plt.title(label='Number of Counties by State', fontsize=14)
plt.xlabel(xlabel='State', fontsize=12)
plt.ylabel(ylabel='Number of Counties', fontsize=12)
# Remove the top, right and left spines (only the bottom one stays) and hide
# the tick marks on every side.
sns.despine(left=True, bottom=False, right=True, top=True)
plt.tick_params(axis='both', which='both', left=False, top=False, bottom=False)
plt.show()
We notice that the state of Texas has the most counties by a big margin followed by Georgia and Virginia
# Load the county-level demographic table, then preview five rows drawn with
# the notebook-wide seed.
county_complete = pd.read_csv(filepath_or_buffer='data/GeneralDemographicData/county_complete.csv')
county_complete.sample(n=5, random_state=RSEED)
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2830 | 51021 | Virginia | Bland County | 6871.0 | 6824 | 6776.0 | 6695.0 | 6675.0 | 6587.0 | 6535.0 | ... | NaN | 1.0 | 5.4 | 7.5 | 0.0 | 8.1 | 10.9 | 6.8 | 95.0 | 93.9 |
| 2087 | 39089 | Ohio | Licking County | 145491.0 | 166492 | 167149.0 | 167568.0 | 168401.0 | 169295.0 | 170425.0 | ... | 15.1 | 2.9 | 3.9 | 6.4 | 0.1 | 4.9 | 3.5 | 9.5 | 91.4 | 90.1 |
| 25 | 1051 | Alabama | Elmore County | 65874.0 | 79303 | 80006.0 | 80220.0 | 80555.0 | 80562.0 | 80912.0 | ... | 15.2 | 1.7 | 4.0 | 7.4 | 0.0 | 3.1 | 2.3 | 10.6 | 75.3 | 73.2 |
| 1122 | 22019 | Louisiana | Calcasieu Parish | 183577.0 | 192768 | 193813.0 | 194506.0 | 195506.0 | 196641.0 | 198542.0 | ... | 25.3 | 2.4 | 4.9 | 9.2 | 0.1 | 2.3 | 1.8 | 7.9 | 70.0 | 67.4 |
| 108 | 4023 | Arizona | Santa Cruz County | 38381.0 | 47420 | 47615.0 | 47292.0 | 46845.0 | 46468.0 | 46191.0 | ... | 31.5 | 1.3 | 7.2 | 10.2 | 2.7 | 4.3 | 3.8 | 5.1 | 85.7 | 14.9 |
5 rows × 188 columns
# Dimensions of the demographic table as (rows, columns).
county_complete.shape
(3142, 188)
# Column dtypes of the demographic table (pandas truncates the listing for wide frames).
county_complete.dtypes
fips int64
state object
name object
pop2000 float64
pop2010 int64
...
uninsured_under_19_2019 float64
uninsured_under_6_2019 float64
veterans_2019 float64
white_2019 float64
white_not_hispanic_2019 float64
Length: 188, dtype: object
There are 113 features with null values, and the poverty features take all of the top 3 spots for most null values; we'll make sure to handle them later in the preprocessing phase.
# Turn the integer fips column into text and left-pad it with zeros to a
# width of 5 so the missing leading zeros are restored.
county_complete['fips'] = county_complete['fips'].astype(str).str.zfill(5)
from urllib.request import urlopen
import json

# Download the county GeoJSON used as the map geometry. The parse must happen
# inside the `with` block, while the HTTP response is still open.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

import plotly.express as px

# County-level choropleth coloured by the 2017 unemployment rate; rows are
# matched to the GeoJSON features through the zero-padded `fips` column.
fig = px.choropleth(
    county_complete,
    geojson=counties,
    locations='fips',
    color='unemployment_rate_2017',
    color_continuous_scale="Viridis",
    range_color=(0, 12),  # fixed colour range so the 2017 and 2019 maps are comparable
    scope="usa",
    labels={'unemployment_rate_2017': 'Unemployment Rate', 'name': 'County'},
    hover_data=["name"],
)
fig.update_layout(
    title_text='Unemployment Rate By Counties 2017<br>(Hover for breakdown)',
)
fig.show()
# Same county map, this time coloured by the 2019 unemployment-rate column.
# The colour range matches the one used for the 2017 map.
fig = px.choropleth(
    data_frame=county_complete,
    geojson=counties,
    locations='fips',
    color='unemployment_rate_2019',
    hover_data=["name"],
    labels={'unemployment_rate_2019':'Unemployment Rate', 'name':'County'},
    scope="usa",
    range_color=(0, 12),
    color_continuous_scale="Viridis",
)
fig.update_layout(title_text='Unemployment rate among those ages 20-64 (2015-2019).<br>(Hover for breakdown)')
fig.show()
# Inner-join the demographic table with the land-area table on the zero-padded
# county code; counties that appear in only one of the tables are dropped.
merged_data1 = pd.merge(
    left=county_complete,
    right=us_landarea,
    how='inner',
    left_on='fips',
    right_on='FIPS',
)
merged_data1
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | FIPS | State | County | Land Area | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 | 01001 | Alabama | Autauga | 1543.7 |
| 1 | 01003 | Alabama | Baldwin County | 140415.0 | 182265 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | 202863.0 | ... | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 | 01003 | Alabama | Baldwin | 4135.0 |
| 2 | 01005 | Alabama | Barbour County | 29038.0 | 27457 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | 26264.0 | ... | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 | 01005 | Alabama | Barbour | 2292.1 |
| 3 | 01007 | Alabama | Bibb County | 20826.0 | 22915 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | 22561.0 | ... | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 | 01007 | Alabama | Bibb | 1611.9 |
| 4 | 01009 | Alabama | Blount County | 51024.0 | 57322 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | 57590.0 | ... | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 | 01009 | Alabama | Blount | 1672.3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3100 | 56037 | Wyoming | Sweetwater County | 37613.0 | 43806 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | 44732.0 | ... | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 | 56037 | Wyoming | Sweetwater | 27003.0 |
| 3101 | 56039 | Wyoming | Teton County | 18251.0 | 21294 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | 23029.0 | ... | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 | 56039 | Wyoming | Teton | 10380.6 |
| 3102 | 56041 | Wyoming | Uinta County | 19742.0 | 21118 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | 20780.0 | ... | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 | 56041 | Wyoming | Uinta | 5391.7 |
| 3103 | 56043 | Wyoming | Washakie County | 8289.0 | 8533 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | 8296.0 | ... | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 | 56043 | Wyoming | Washakie | 5802.0 |
| 3104 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 | 56045 | Wyoming | Weston | 6210.6 |
3105 rows × 192 columns
# Remove the land-area table's key/label columns, which duplicate fips/state/name
# after the merge. `columns=` is used instead of a positional axis argument:
# passing `1` positionally to DataFrame.drop was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0 onwards.
merged_data1 = merged_data1.drop(columns=['FIPS', 'State', 'County'])
merged_data1
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | Land Area | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | 2.2 | 3.5 | 7.1 | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 | 1543.7 |
| 1 | 01003 | Alabama | Baldwin County | 140415.0 | 182265 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | 202863.0 | ... | 1.7 | 4.0 | 8.9 | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 | 4135.0 |
| 2 | 01005 | Alabama | Barbour County | 29038.0 | 27457 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | 26264.0 | ... | 1.2 | 9.4 | 11.3 | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 | 2292.1 |
| 3 | 01007 | Alabama | Bibb County | 20826.0 | 22915 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | 22561.0 | ... | 0.6 | 7.0 | 10.7 | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 | 1611.9 |
| 4 | 01009 | Alabama | Blount County | 51024.0 | 57322 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | 57590.0 | ... | 1.6 | 3.1 | 10.8 | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 | 1672.3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3100 | 56037 | Wyoming | Sweetwater County | 37613.0 | 43806 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | 44732.0 | ... | 2.3 | 5.7 | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 | 27003.0 |
| 3101 | 56039 | Wyoming | Teton County | 18251.0 | 21294 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | 23029.0 | ... | 0.7 | 0.7 | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 | 10380.6 |
| 3102 | 56041 | Wyoming | Uinta County | 19742.0 | 21118 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | 20780.0 | ... | 3.5 | 5.5 | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 | 5391.7 |
| 3103 | 56043 | Wyoming | Washakie County | 8289.0 | 8533 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | 8296.0 | ... | 3.8 | 4.1 | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 | 5802.0 |
| 3104 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | 1.3 | 4.0 | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 | 6210.6 |
3105 rows × 189 columns
# Load the life-expectancy workbook and preview its first rows.
life_expectancy = pd.read_excel(io='data/LifeExpectancyData/IHME_USA_LIFE_EXPECTANCY_1987_2007_Y2011M06D16.XLSX')
life_expectancy.head()
| fips | State | County | Year | Male life expectancy (years) | Years behind international frontier (male) | Female life expectancy (years) | Years behind international frontier (female) | White male life expectancy (years) | White female life expectancy (years) | ... | Closest ranked countries for male life expectancy (lower) | Closest ranked countries for female life expectancy (lower) | Rank (male) | Rank (female) | Male life expectancy change 1987 to 2007 (years) | Female life expectancy change 1987 to 2007 (years) | Male life expectancy change 1987 to 1997 (years) | Female life expectancy change 1987 to 1997 (years) | Male life expectancy change 1997 to 2007 (years) | Female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | ALABAMA | AUTAUGA | 1987 | 69.2 | 32 | 77.4 | 12 | 70.3 | 78.5 | ... | Albania,Bahrain,Guam,Jamaica,Macedonia, the Fo... | Guadeloupe,Israel,Malta,Montenegro,Portugal | 2684 | 2661 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 1 | 1001 | ALABAMA | AUTAUGA | 1988 | 69.3 | 32 | 77.3 | 14 | 70.5 | 78.4 | ... | Albania,Jamaica,Macedonia, the Former Yugoslav... | Barbados,Cuba,Mayotte,Reunion,Slovenia | 2646 | 2691 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 2 | 1001 | ALABAMA | AUTAUGA | 1989 | 69.8 | 25 | 77.5 | 14 | 71.0 | 78.6 | ... | Albania,Bahrain,Guam,Macedonia, the Former Yug... | Barbados,Mayotte,Reunion,Singapore,Slovenia | 2522 | 2637 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 3 | 1001 | ALABAMA | AUTAUGA | 1990 | 69.7 | 27 | 77.7 | 14 | 71.0 | 78.7 | ... | Albania,French Guiana,Macedonia, the Former Yu... | Barbados,Mayotte,Netherlands Antilles,Reunion,... | 2585 | 2590 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 4 | 1001 | ALABAMA | AUTAUGA | 1991 | 70.0 | 25 | 77.7 | 15 | 71.2 | 78.8 | ... | French Guiana,Macedonia, the Former Yugoslav R... | Barbados,Kuwait,Netherlands Antilles,Singapore... | 2534 | 2616 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
5 rows × 24 columns
# Dimensions of the life-expectancy table as (rows, columns).
life_expectancy.shape
(66087, 24)
# Column dtypes; used to spot numeric-looking columns that were parsed as object.
life_expectancy.dtypes
fips int64 State object County object Year int64 Male life expectancy (years) float64 Years behind international frontier (male) object Female life expectancy (years) float64 Years behind international frontier (female) object White male life expectancy (years) float64 White female life expectancy (years) float64 Black male life expectancy (years) float64 Black female life expectancy (years) float64 Closest ranked countries for male life expectancy (higher) object Closest ranked countries for female life expectancy (higher) object Closest ranked countries for male life expectancy (lower) object Closest ranked countries for female life expectancy (lower) object Rank (male) int64 Rank (female) int64 Male life expectancy change 1987 to 2007 (years) float64 Female life expectancy change 1987 to 2007 (years) float64 Male life expectancy change 1987 to 1997 (years) float64 Female life expectancy change 1987 to 1997 (years) float64 Male life expectancy change 1997 to 2007 (years) float64 Female life expectancy change 1997 to 2007 (years) float64 dtype: object
The columns "Years behind international frontier (male)" and "Years behind international frontier (female)" hold numeric values but are of type object, so we'll need to change their types to numeric later on in the preprocessing phase.
# Build a two-column table (column name, number of missing values) for the
# life-expectancy data and preview it.
life_expectancy_null = (
    life_expectancy.isnull()
    .sum()
    .reset_index(name='Null Count')
)
life_expectancy_null.head()
| index | Null Count | |
|---|---|---|
| 0 | fips | 0 |
| 1 | State | 0 |
| 2 | County | 0 |
| 3 | Year | 0 |
| 4 | Male life expectancy (years) | 0 |
# Keep only the columns that have at least one missing value.
has_nulls = life_expectancy_null['Null Count'].ne(0)
life_expectancy_null = life_expectancy_null[has_nulls]
life_expectancy_null
| index | Null Count | |
|---|---|---|
| 10 | Black male life expectancy (years) | 50593 |
| 11 | Black female life expectancy (years) | 47725 |
| 12 | Closest ranked countries for male life expecta... | 96 |
| 13 | Closest ranked countries for female life expec... | 11 |
Black male/female life expectancy (years) have approximately 100k missing values in total (50,593 and 47,725 respectively), and we'll need to fill in these missing values later in the preprocessing phase for more accurate models and results.
# States whose male life-expectancy rank is compared over time.
state_list = ['CALIFORNIA', 'FLORIDA', 'SOUTH DAKOTA', 'WYOMING']
filtered_df = life_expectancy.loc[life_expectancy['State'].isin(state_list)]

# One line per state: each (Year, State) cell holds pivot_table's default
# aggregate (the mean) of "Rank (male)" over that state's rows for the year.
ax = filtered_df.pivot_table(
    index="Year",
    columns="State",
    values="Rank (male)",
).plot()
ax.set_title("Rank(male) over the years")
ax.set_xlabel("Year")
ax.set_ylabel("Rank(male)")
# Years are whole numbers, so force integer tick locations.
ax.locator_params(integer=True)
The rank for males in the state of California is noticeably getting lower over the years; the same holds for Florida, but to a lesser degree. Meanwhile, the state of South Dakota is getting higher ranks, and lastly Wyoming fluctuates between going down and up over the years.
# Turn the integer fips column into text and left-pad it with zeros to a
# width of 5 so it matches the key format used in merged_data1.
life_expectancy['fips'] = life_expectancy['fips'].astype(str).str.zfill(5)

# Inner-join on the shared `fips` key. life_expectancy has one row per county
# per year, so each county row of merged_data1 is repeated once per year.
merged_data2 = pd.merge(
    left=merged_data1,
    right=life_expectancy,
    how='inner',
    on='fips',
)
merged_data2
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | Closest ranked countries for male life expectancy (lower) | Closest ranked countries for female life expectancy (lower) | Rank (male) | Rank (female) | Male life expectancy change 1987 to 2007 (years) | Female life expectancy change 1987 to 2007 (years) | Male life expectancy change 1987 to 1997 (years) | Female life expectancy change 1987 to 1997 (years) | Male life expectancy change 1997 to 2007 (years) | Female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | Albania,Bahrain,Guam,Jamaica,Macedonia, the Fo... | Guadeloupe,Israel,Malta,Montenegro,Portugal | 2684 | 2661 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 1 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | Albania,Jamaica,Macedonia, the Former Yugoslav... | Barbados,Cuba,Mayotte,Reunion,Slovenia | 2646 | 2691 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 2 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | Albania,Bahrain,Guam,Macedonia, the Former Yug... | Barbados,Mayotte,Reunion,Singapore,Slovenia | 2522 | 2637 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 3 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | Albania,French Guiana,Macedonia, the Former Yu... | Barbados,Mayotte,Netherlands Antilles,Reunion,... | 2585 | 2590 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 4 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | French Guiana,Macedonia, the Former Yugoslav R... | Barbados,Kuwait,Netherlands Antilles,Singapore... | 2534 | 2616 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 65200 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Brunei Darussalam,Chile,Finland,Korea, Republi... | Cuba,Denmark,Greece,Mayotte,Reunion | 957 | 749 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 65201 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Chile,Denmark,Finland,Guadeloupe,Virgin Island... | Cuba,Denmark,Kuwait,Mayotte,Reunion | 779 | 808 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 65202 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Brunei Darussalam,Chile,Korea, Republic of,Por... | Cuba,Denmark,French Guiana,Mayotte,Reunion | 883 | 742 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 65203 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Brunei Darussalam,Chile,Korea, Republic of,Por... | Cuba,Denmark,Greece,Mayotte,Reunion | 843 | 563 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 65204 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Brunei Darussalam,Chile,Korea, Republic of,Por... | Channel Islands,Greece,Malta,United Kingdom,Un... | 727 | 393 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
65205 rows × 212 columns
# Remove the life-expectancy table's State/County label columns, which
# duplicate state/name after the merge. `columns=` replaces the positional
# axis argument: passing `1` positionally to DataFrame.drop was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
merged_data2 = merged_data2.drop(columns=['State', 'County'])
merged_data2
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | Closest ranked countries for male life expectancy (lower) | Closest ranked countries for female life expectancy (lower) | Rank (male) | Rank (female) | Male life expectancy change 1987 to 2007 (years) | Female life expectancy change 1987 to 2007 (years) | Male life expectancy change 1987 to 1997 (years) | Female life expectancy change 1987 to 1997 (years) | Male life expectancy change 1997 to 2007 (years) | Female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | Albania,Bahrain,Guam,Jamaica,Macedonia, the Fo... | Guadeloupe,Israel,Malta,Montenegro,Portugal | 2684 | 2661 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 1 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | Albania,Jamaica,Macedonia, the Former Yugoslav... | Barbados,Cuba,Mayotte,Reunion,Slovenia | 2646 | 2691 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 2 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | Albania,Bahrain,Guam,Macedonia, the Former Yug... | Barbados,Mayotte,Reunion,Singapore,Slovenia | 2522 | 2637 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 3 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | Albania,French Guiana,Macedonia, the Former Yu... | Barbados,Mayotte,Netherlands Antilles,Reunion,... | 2585 | 2590 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 4 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | French Guiana,Macedonia, the Former Yugoslav R... | Barbados,Kuwait,Netherlands Antilles,Singapore... | 2534 | 2616 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 65200 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Brunei Darussalam,Chile,Finland,Korea, Republi... | Cuba,Denmark,Greece,Mayotte,Reunion | 957 | 749 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 65201 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Chile,Denmark,Finland,Guadeloupe,Virgin Island... | Cuba,Denmark,Kuwait,Mayotte,Reunion | 779 | 808 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 65202 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Brunei Darussalam,Chile,Korea, Republic of,Por... | Cuba,Denmark,French Guiana,Mayotte,Reunion | 883 | 742 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 65203 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Brunei Darussalam,Chile,Korea, Republic of,Por... | Cuba,Denmark,Greece,Mayotte,Reunion | 843 | 563 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 65204 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | Brunei Darussalam,Chile,Korea, Republic of,Por... | Channel Islands,Greece,Malta,United Kingdom,Un... | 727 | 393 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
65205 rows × 210 columns
# Load the 1976-2020 US House election results, then preview five rows drawn
# with the notebook-wide seed.
house_elections = pd.read_csv(filepath_or_buffer='data/USElectionResults19762020/1976-2020-house.csv')
house_elections.sample(n=5, random_state=RSEED)
| year | state | state_po | state_fips | state_cen | state_ic | office | district | stage | runoff | special | candidate | party | writein | mode | candidatevotes | totalvotes | unofficial | version | fusion_ticket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 29058 | 2018 | NEW YORK | NY | 36 | 21 | 13 | US HOUSE | 8 | GEN | NaN | False | ERNEST C JOHNSON | CONSERVATIVE | False | TOTAL | 9997 | 191567 | False | 20220331 | False |
| 30797 | 2020 | SOUTH CAROLINA | SC | 45 | 57 | 48 | US HOUSE | 2 | GEN | False | False | ADAIR FORD BOROUGHS | DEMOCRAT | False | TOTAL | 155118 | 364215 | False | 20220331 | False |
| 22920 | 2010 | GEORGIA | GA | 13 | 58 | 44 | US HOUSE | 3 | GEN | NaN | False | WRITEIN | NaN | True | TOTAL | 3 | 242239 | False | 20220331 | False |
| 20574 | 2006 | NEW YORK | NY | 36 | 21 | 13 | US HOUSE | 1 | GEN | False | False | TIMOTHY H BISHOP | DEMOCRAT | False | TOTAL | 92546 | 179113 | False | 20220331 | True |
| 23974 | 2010 | VIRGINIA | VA | 51 | 54 | 40 | US HOUSE | 7 | GEN | NaN | False | WRITEIN | NaN | True | TOTAL | 413 | 233402 | False | 20220331 | False |
house_elections.shape
(31103, 20)
house_elections.dtypes
year int64 state object state_po object state_fips int64 state_cen int64 state_ic int64 office object district int64 stage object runoff object special bool candidate object party object writein bool mode object candidatevotes int64 totalvotes int64 unofficial bool version int64 fusion_ticket bool dtype: object
For this dataframe we're going to drop some features because we won't use them for building the required models. The features we're going to drop are: state_po, state_fips, state_cen, state_ic, stage, runoff, special, writein, mode, unofficial, version, fusion_ticket.
# Drop the election columns that are not used for modelling. `columns=` is
# used instead of a positional axis argument: passing `1` positionally to
# DataFrame.drop was deprecated in pandas 1.3 and raises a TypeError from
# pandas 2.0 onwards.
house_elections = house_elections.drop(
    columns=[
        'state_po',
        'state_fips',
        'state_cen',
        'state_ic',
        'stage',
        'runoff',
        'special',
        'writein',
        'mode',
        'unofficial',
        'version',
        'fusion_ticket',
    ]
)
house_elections
| year | state | office | district | candidate | party | candidatevotes | totalvotes | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | US HOUSE | 1 | BILL DAVENPORT | DEMOCRAT | 58906 | 157170 |
| 1 | 1976 | ALABAMA | US HOUSE | 1 | JACK EDWARDS | REPUBLICAN | 98257 | 157170 |
| 2 | 1976 | ALABAMA | US HOUSE | 1 | WRITEIN | NaN | 7 | 157170 |
| 3 | 1976 | ALABAMA | US HOUSE | 2 | J CAROLE KEAHEY | DEMOCRAT | 66288 | 156362 |
| 4 | 1976 | ALABAMA | US HOUSE | 2 | WILLIAM L "BILL" DICKINSON | REPUBLICAN | 90069 | 156362 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 31098 | 2020 | WYOMING | US HOUSE | 0 | LYNNETTE GREY BULL | DEMOCRAT | 66576 | 278503 |
| 31099 | 2020 | WYOMING | US HOUSE | 0 | OVERVOTES | NaN | 1274 | 278503 |
| 31100 | 2020 | WYOMING | US HOUSE | 0 | RICHARD BRUBAKER | LIBERTARIAN | 10154 | 278503 |
| 31101 | 2020 | WYOMING | US HOUSE | 0 | UNDERVOTES | NaN | 6337 | 278503 |
| 31102 | 2020 | WYOMING | US HOUSE | 0 | WRITEIN | NaN | 525 | 278503 |
31103 rows × 8 columns
# Number of missing values in each remaining column of the election table.
house_elections.isnull().sum()
year 0 state 0 office 0 district 0 candidate 0 party 3620 candidatevotes 0 totalvotes 0 dtype: int64
# Restrict the election results to the 2020 rows for the visualisations below.
is_2020 = house_elections['year'].eq(2020)
viz_df = house_elections[is_2020]
viz_df
| year | state | office | district | candidate | party | candidatevotes | totalvotes | |
|---|---|---|---|---|---|---|---|---|
| 29636 | 2020 | ALABAMA | US HOUSE | 1 | JAMES AVERHART | DEMOCRAT | 116949 | 329075 |
| 29637 | 2020 | ALABAMA | US HOUSE | 1 | JERRY CARL | REPUBLICAN | 211825 | 329075 |
| 29638 | 2020 | ALABAMA | US HOUSE | 1 | WRITEIN | NaN | 301 | 329075 |
| 29639 | 2020 | ALABAMA | US HOUSE | 2 | BARRY MOORE | REPUBLICAN | 197996 | 303569 |
| 29640 | 2020 | ALABAMA | US HOUSE | 2 | PHYLLIS HARVEY-HALL | DEMOCRAT | 105286 | 303569 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 31098 | 2020 | WYOMING | US HOUSE | 0 | LYNNETTE GREY BULL | DEMOCRAT | 66576 | 278503 |
| 31099 | 2020 | WYOMING | US HOUSE | 0 | OVERVOTES | NaN | 1274 | 278503 |
| 31100 | 2020 | WYOMING | US HOUSE | 0 | RICHARD BRUBAKER | LIBERTARIAN | 10154 | 278503 |
| 31101 | 2020 | WYOMING | US HOUSE | 0 | UNDERVOTES | NaN | 6337 | 278503 |
| 31102 | 2020 | WYOMING | US HOUSE | 0 | WRITEIN | NaN | 525 | 278503 |
1467 rows × 8 columns
# Narrow the 2020 results to the four states of interest.
state_list = ['CALIFORNIA', 'FLORIDA', 'SOUTH DAKOTA', 'WYOMING']
viz_df = viz_df.loc[viz_df['state'].isin(state_list)]
viz_df
| year | state | office | district | candidate | party | candidatevotes | totalvotes | |
|---|---|---|---|---|---|---|---|---|
| 29695 | 2020 | CALIFORNIA | US HOUSE | 1 | AUDREY DENNEY | DEMOCRAT | 154073 | 358263 |
| 29696 | 2020 | CALIFORNIA | US HOUSE | 1 | DOUG LAMALFA | REPUBLICAN | 204190 | 358263 |
| 29697 | 2020 | CALIFORNIA | US HOUSE | 2 | DALE K MENSING | REPUBLICAN | 94320 | 388755 |
| 29698 | 2020 | CALIFORNIA | US HOUSE | 2 | JARED HUFFMAN | DEMOCRAT | 294435 | 388755 |
| 29699 | 2020 | CALIFORNIA | US HOUSE | 3 | JOHN GARAMENDI | DEMOCRAT | 176043 | 321988 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 31098 | 2020 | WYOMING | US HOUSE | 0 | LYNNETTE GREY BULL | DEMOCRAT | 66576 | 278503 |
| 31099 | 2020 | WYOMING | US HOUSE | 0 | OVERVOTES | NaN | 1274 | 278503 |
| 31100 | 2020 | WYOMING | US HOUSE | 0 | RICHARD BRUBAKER | LIBERTARIAN | 10154 | 278503 |
| 31101 | 2020 | WYOMING | US HOUSE | 0 | UNDERVOTES | NaN | 6337 | 278503 |
| 31102 | 2020 | WYOMING | US HOUSE | 0 | WRITEIN | NaN | 525 | 278503 |
188 rows × 8 columns
# Total the votes per candidate name and keep the five highest totals.
candidate_top5 = (
    viz_df.groupby('candidate', as_index=False)['candidatevotes']
    .sum()
    .nlargest(n=5, columns='candidatevotes')
)
candidate_top5
| candidate | candidatevotes | |
|---|---|---|
| 21 | BARBARA LEE | 327863 |
| 57 | DUSTY JOHNSON | 321984 |
| 44 | DANIEL WEBSTER | 316979 |
| 96 | JOHN H RUTHERFORD | 308497 |
| 141 | NEAL P DUNN | 305337 |
# Bar chart of the five candidates with the most votes in 2020.
plt.figure(figsize=(12, 6))
plt.bar(candidate_top5['candidate'], candidate_top5['candidatevotes'], color='#b3cde3', edgecolor='black')
# The bars are per candidate (see the x-axis label), so the title says
# "by Candidate"; the previous "by State" wording did not match the chart.
plt.title('Number of Votes by Candidate - 2020', fontsize=14)
plt.xlabel('Candidate', fontsize=12)
plt.ylabel('Number of Votes', fontsize=12)
# Remove the top, right and left spines and hide the tick marks on every side.
sns.despine(top=True, right=True, bottom=False, left=True)
plt.tick_params(axis='both', which='both', bottom=False, top=False, left=False)
plt.show()
We can barely see any difference between the number of votes for the top 5 candidates in the year 2020.
In this section, we're going to apply several pre-processing methods to the data in order to prepare it for the models that we wish to apply in the next section.
# Print the per-column summary of the merged table; verbose=True asks for the
# full column listing instead of a shortened one.
merged_data2.info(verbose=True)
<class 'pandas.core.frame.DataFrame'> Int64Index: 65205 entries, 0 to 65204 Data columns (total 210 columns): # Column Dtype --- ------ ----- 0 fips object 1 state object 2 name object 3 pop2000 float64 4 pop2010 int64 5 pop2011 float64 6 pop2012 float64 7 pop2013 float64 8 pop2014 float64 9 pop2015 float64 10 pop2016 float64 11 pop2017 float64 12 age_under_5_2010 float64 13 age_under_5_2017 float64 14 age_under_18_2010 float64 15 age_over_65_2010 float64 16 age_over_65_2017 float64 17 median_age_2017 float64 18 female_2010 float64 19 white_2010 float64 20 black_2010 float64 21 black_2017 float64 22 native_2010 float64 23 native_2017 float64 24 asian_2010 float64 25 asian_2017 float64 26 pac_isl_2010 float64 27 pac_isl_2017 float64 28 other_single_race_2017 float64 29 two_plus_races_2010 float64 30 two_plus_races_2017 float64 31 hispanic_2010 float64 32 hispanic_2017 float64 33 white_not_hispanic_2010 float64 34 white_not_hispanic_2017 float64 35 speak_english_only_2017 float64 36 no_move_in_one_plus_year_2010 float64 37 foreign_born_2010 float64 38 foreign_spoken_at_home_2010 float64 39 women_16_to_50_birth_rate_2017 float64 40 hs_grad_2010 float64 41 hs_grad_2016 float64 42 hs_grad_2017 float64 43 some_college_2016 float64 44 some_college_2017 float64 45 bachelors_2010 float64 46 bachelors_2016 float64 47 bachelors_2017 float64 48 veterans_2010 int64 49 veterans_2017 float64 50 mean_work_travel_2010 float64 51 mean_work_travel_2017 float64 52 broadband_2017 float64 53 computer_2017 float64 54 housing_units_2010 int64 55 homeownership_2010 float64 56 housing_multi_unit_2010 float64 57 median_val_owner_occupied_2010 int64 58 households_2010 int64 59 households_2017 float64 60 persons_per_household_2010 float64 61 persons_per_household_2017 float64 62 per_capita_income_2010 int64 63 per_capita_income_2017 float64 64 metro_2013 float64 65 median_household_income_2010 int64 66 median_household_income_2016 float64 67 median_household_income_2017 float64 68 
private_nonfarm_establishments_2009 int64 69 private_nonfarm_employment_2009 int64 70 percent_change_private_nonfarm_employment_2009 float64 71 nonemployment_establishments_2009 float64 72 firms_2007 float64 73 black_owned_firms_2007 float64 74 native_owned_firms_2007 float64 75 asian_owned_firms_2007 float64 76 pac_isl_owned_firms_2007 float64 77 hispanic_owned_firms_2007 float64 78 women_owned_firms_2007 float64 79 manufacturer_shipments_2007 float64 80 mercent_whole_sales_2007 float64 81 sales_2007 float64 82 sales_per_capita_2007 float64 83 accommodation_food_service_2007 float64 84 building_permits_2010 int64 85 fed_spending_2009 float64 86 area_2010 float64 87 density_2010 float64 88 smoking_ban_2010 object 89 poverty_2010 float64 90 poverty_2016 float64 91 poverty_2017 float64 92 poverty_age_under_5_2017 float64 93 poverty_age_under_18_2017 float64 94 civilian_labor_force_2007 float64 95 employed_2007 float64 96 unemployed_2007 float64 97 unemployment_rate_2007 float64 98 civilian_labor_force_2008 float64 99 employed_2008 float64 100 unemployed_2008 float64 101 unemployment_rate_2008 float64 102 civilian_labor_force_2009 float64 103 employed_2009 float64 104 unemployed_2009 float64 105 unemployment_rate_2009 float64 106 civilian_labor_force_2010 float64 107 employed_2010 float64 108 unemployed_2010 float64 109 unemployment_rate_2010 float64 110 civilian_labor_force_2011 float64 111 employed_2011 float64 112 unemployed_2011 float64 113 unemployment_rate_2011 float64 114 civilian_labor_force_2012 float64 115 employed_2012 float64 116 unemployed_2012 float64 117 unemployment_rate_2012 float64 118 civilian_labor_force_2013 float64 119 employed_2013 float64 120 unemployed_2013 float64 121 unemployment_rate_2013 float64 122 civilian_labor_force_2014 float64 123 employed_2014 float64 124 unemployed_2014 float64 125 unemployment_rate_2014 float64 126 civilian_labor_force_2015 float64 127 employed_2015 float64 128 unemployed_2015 float64 129 unemployment_rate_2015 
float64 130 civilian_labor_force_2016 float64 131 employed_2016 float64 132 unemployed_2016 float64 133 unemployment_rate_2016 float64 134 uninsured_2017 float64 135 uninsured_age_under_6_2017 object 136 uninsured_age_under_19_2017 float64 137 uninsured_age_over_74_2017 float64 138 civilian_labor_force_2017 float64 139 employed_2017 float64 140 unemployed_2017 float64 141 unemployment_rate_2017 float64 142 age_over_18_2019 float64 143 age_over_65_2019 float64 144 age_over_85_2019 float64 145 age_under_5_2019 float64 146 asian_2019 float64 147 avg_family_size_2019 float64 148 bachelors_2019 float64 149 black_2019 float64 150 hispanic_2019 float64 151 household_has_broadband_2019 float64 152 household_has_computer_2019 float64 153 household_has_smartphone_2019 float64 154 households_2019 int64 155 households_speak_asian_or_pac_isl_2019 float64 156 households_speak_limited_english_2019 float64 157 households_speak_other_2019 float64 158 households_speak_other_indo_euro_lang_2019 float64 159 households_speak_spanish_2019 float64 160 housing_mobile_homes_2019 float64 161 housing_one_unit_structures_2019 float64 162 housing_two_unit_structures_2019 float64 163 hs_grad_2019 float64 164 mean_household_income_2019 int64 165 mean_work_travel_2019 float64 166 median_age_2019 float64 167 median_household_income_2019 int64 168 median_individual_income_2019 int64 169 median_individual_income_age_25plus_2019 int64 170 native_2019 float64 171 other_single_race_2019 float64 172 pac_isl_2019 float64 173 per_capita_income_2019 int64 174 persons_per_household_2019 float64 175 pop_2019 int64 176 poverty_2019 float64 177 poverty_65_and_over_2019 float64 178 poverty_under_18_2019 float64 179 two_plus_races_2019 float64 180 unemployment_rate_2019 float64 181 uninsured_2019 float64 182 uninsured_65_and_older_2019 float64 183 uninsured_under_19_2019 float64 184 uninsured_under_6_2019 float64 185 veterans_2019 float64 186 white_2019 float64 187 white_not_hispanic_2019 float64 188 Land Area 
float64 189 Year int64 190 Male life expectancy (years) float64 191 Years behind international frontier (male) object 192 Female life expectancy (years) float64 193 Years behind international frontier (female) object 194 White male life expectancy (years) float64 195 White female life expectancy (years) float64 196 Black male life expectancy (years) float64 197 Black female life expectancy (years) float64 198 Closest ranked countries for male life expectancy (higher) object 199 Closest ranked countries for female life expectancy (higher) object 200 Closest ranked countries for male life expectancy (lower) object 201 Closest ranked countries for female life expectancy (lower) object 202 Rank (male) int64 203 Rank (female) int64 204 Male life expectancy change 1987 to 2007 (years) float64 205 Female life expectancy change 1987 to 2007 (years) float64 206 Male life expectancy change 1987 to 1997 (years) float64 207 Female life expectancy change 1987 to 1997 (years) float64 208 Male life expectancy change 1997 to 2007 (years) float64 209 Female life expectancy change 1997 to 2007 (years) float64 dtypes: float64(179), int64(20), object(11) memory usage: 105.0+ MB
We notice that the feature 'uninsured_age_under_6_2017' is of type object even though it holds numeric values, so we are going to convert it to a numeric type. The same applies to the features 'Years behind international frontier (male)' and 'Years behind international frontier (female)'.
# List the distinct raw values of the column to see what keeps it from being numeric.
merged_data2['uninsured_age_under_6_2017'].unique()
array(['1.1', '2.4', '4.1', '2.7', '3.9', '0', '1.5', '0.3', '6', '1.8',
'3.7', '1.7', '12.5', '4.3', '4.6', '2', '6.5', '5', '0.2', '4.5',
'2.9', '1.4', '1.9', '2.2', '8.3', '0.1', '7.9', '5.9', '2.3',
'0.9', '2.1', '1', '4', '14.4', '3.6', '2.5', '3', '5.2', '1.3',
'0.5', '3.5', '0.7', '0.6', '1.6', '3.4', '2.8', '9.7', '18.5',
'3.3', '9.2', '9.5', '5.4', '10.3', '5.1', '7.2', '8.9', '8.4',
'7.3', '10.4', '9.1', '6.9', '6.4', '7.8', '7.5', '0.4', '18', '7',
'5.5', '3.1', '3.8', '4.8', '5.7', '6.6', '4.4', '8.5', '3.2',
'2.6', '5.6', '15.7', '4.2', '4.9', '4.7', '16', '5.3', '10.2',
'15.2', '16.1', '9.8', '9.6', '11.3', '11.5', '11.2', '5.8', '9',
'13.9', '11.1', '6.1', '1.2', '13.8', '6.8', '15.1', '6.7', '9.9',
'7.7', '8.2', '10.1', '14.5', '7.4', '7.1', '8.1', '8.8', '9.3',
'10.8', '16.6', '18.4', '18.2', '10', '17.3', '8.6', '26.6',
'18.8', '12.1', '11.8', '13', '13.5', '16.9', '12.6', '21.5',
'14.2', '10.5', '12.9', '16.7', '8', '13.2', '12.2', '36.4',
'15.4', '10.9', '9.4', '8.7', '17.6', '12', '14.6', '17.1', '19.4',
'13.3', '0.8', '6.2', '6.3', '17.8', '35.3', '16.4', '36.8', '7.6',
'22.1', '11.4', '56.8', '13.4', '15', '20.7', '18.7', '17.4',
'12.3', '11.6', '56.2', '15.8', '21.1', '20.8', '22.7', '17.9',
'13.6', '11.9', '13.7', '15.9', '12.7', '16.2', '18.1', '18.9',
'16.3', '11', '20.6', '25.7', '35.6', '29.9', '22.3', '21.4',
'32.1', '25.8', '24.7', '14.7', '10.6', '10.7', '19.6', '16.8',
'15.3', '33.8', '25.3', '17', '19.8', '42.9', '36', '20', '17.5',
'21.6', '18.6', '33.9', '22.8', '30.6', '29.3', '20.2', '14.3',
'23.4', '14.8', '41.2', '23.6', '42.3', '19.1', '26', '22.9',
'55.6', '14.1', '24.1', '12.4', '12.8', '19.5', '19.7', '23.3',
'14.9', '25.2', '19.2', '23.5', '17.2', '21.2', '24.3', '22',
'11.7', '19.3', '24.8', '26.4', '15.6', '14', '25.4', '51', '25',
'48.5', '32.4', '25.5', '-', '21.8', '28.6', '18.3', '42.2',
'21.3', '13.1', '25.9', '24.4', '23', '39.9', '26.7', '28.1'],
dtype=object)
# replace "-" values with null
def change(x):
    """Convert a raw cell to ``float``, mapping the "-" placeholder to ``None``.

    Args:
        x: A raw cell value: a numeric string such as ``"1.1"``, a number,
            the placeholder ``"-"``, or ``None``.

    Returns:
        ``None`` when ``x`` is ``None`` or the ``"-"`` placeholder (with or
        without surrounding whitespace); otherwise ``float(x)``.

    Raises:
        ValueError: If ``x`` is any other string that ``float`` cannot parse.
    """
    if x is None:
        # Already missing: pass it through instead of failing in float().
        return None
    if isinstance(x, str) and x.strip() == "-":
        return None
    return float(x)
# Apply the converter element-wise and overwrite the column with the result.
merged_data2['uninsured_age_under_6_2017'] = merged_data2['uninsured_age_under_6_2017'].map(change)
# Re-check the distinct values after the conversion.
merged_data2['uninsured_age_under_6_2017'].unique()
array([ 1.1, 2.4, 4.1, 2.7, 3.9, 0. , 1.5, 0.3, 6. , 1.8, 3.7,
1.7, 12.5, 4.3, 4.6, 2. , 6.5, 5. , 0.2, 4.5, 2.9, 1.4,
1.9, 2.2, 8.3, 0.1, 7.9, 5.9, 2.3, 0.9, 2.1, 1. , 4. ,
14.4, 3.6, 2.5, 3. , 5.2, 1.3, 0.5, 3.5, 0.7, 0.6, 1.6,
3.4, 2.8, 9.7, 18.5, 3.3, 9.2, 9.5, 5.4, 10.3, 5.1, 7.2,
8.9, 8.4, 7.3, 10.4, 9.1, 6.9, 6.4, 7.8, 7.5, 0.4, 18. ,
7. , 5.5, 3.1, 3.8, 4.8, 5.7, 6.6, 4.4, 8.5, 3.2, 2.6,
5.6, 15.7, 4.2, 4.9, 4.7, 16. , 5.3, 10.2, 15.2, 16.1, 9.8,
9.6, 11.3, 11.5, 11.2, 5.8, 9. , 13.9, 11.1, 6.1, 1.2, 13.8,
6.8, 15.1, 6.7, 9.9, 7.7, 8.2, 10.1, 14.5, 7.4, 7.1, 8.1,
8.8, 9.3, 10.8, 16.6, 18.4, 18.2, 10. , 17.3, 8.6, 26.6, 18.8,
12.1, 11.8, 13. , 13.5, 16.9, 12.6, 21.5, 14.2, 10.5, 12.9, 16.7,
8. , 13.2, 12.2, 36.4, 15.4, 10.9, 9.4, 8.7, 17.6, 12. , 14.6,
17.1, 19.4, 13.3, 0.8, 6.2, 6.3, 17.8, 35.3, 16.4, 36.8, 7.6,
22.1, 11.4, 56.8, 13.4, 15. , 20.7, 18.7, 17.4, 12.3, 11.6, 56.2,
15.8, 21.1, 20.8, 22.7, 17.9, 13.6, 11.9, 13.7, 15.9, 12.7, 16.2,
18.1, 18.9, 16.3, 11. , 20.6, 25.7, 35.6, 29.9, 22.3, 21.4, 32.1,
25.8, 24.7, 14.7, 10.6, 10.7, 19.6, 16.8, 15.3, 33.8, 25.3, 17. ,
19.8, 42.9, 36. , 20. , 17.5, 21.6, 18.6, 33.9, 22.8, 30.6, 29.3,
20.2, 14.3, 23.4, 14.8, 41.2, 23.6, 42.3, 19.1, 26. , 22.9, 55.6,
14.1, 24.1, 12.4, 12.8, 19.5, 19.7, 23.3, 14.9, 25.2, 19.2, 23.5,
17.2, 21.2, 24.3, 22. , 11.7, 19.3, 24.8, 26.4, 15.6, 14. , 25.4,
51. , 25. , 48.5, 32.4, 25.5, nan, 21.8, 28.6, 18.3, 42.2, 21.3,
13.1, 25.9, 24.4, 23. , 39.9, 26.7, 28.1])
# List the distinct raw values of the male "years behind" column before cleaning.
merged_data2['Years behind international frontier (male)'].unique()
array([32, 25, 27, 24, 23, 28, 26, 12, 11, 14, 16, 15, 13, 18, 19, '50+',
44, 43, 37, 36, 33, 35, 34, 38, 39, 40, 41, 45, 46, 17, 20, 21, 22,
47, 50, 31, 29, 30, 42, 49, 48, 10, 9, 8, 7, 5, -4, -6, -5, -1, 3,
4, 2, 6, 1, 0, -2, -3, -7, -8, -11, -13, -9, -10, -12, -14, -16,
-15], dtype=object)
# replace 50+ values with 50 and values which are less than 0 by 0
# NOTE: this redefines the earlier `change` helper used for the uninsured column.
def change(x):
    """Convert a "years behind" cell to a non-negative ``float``.

    Args:
        x: A raw cell value: a number, a numeric string, or an open-ended
            bucket label such as ``"50+"``.

    Returns:
        ``float(x)`` with any trailing ``+`` removed first, and with negative
        values clamped to ``0.0``.

    Raises:
        ValueError: If ``x`` is a string that is not numeric after the
            trailing ``+`` is removed.
    """
    if isinstance(x, str):
        # "50+" is an open-ended top bucket; keep its numeric lower bound.
        x = x.strip().rstrip("+")
    # Convert before comparing so that numeric strings do not raise TypeError.
    value = float(x)
    return 0.0 if value < 0 else value
# Apply the converter element-wise and overwrite the male column with the result.
merged_data2['Years behind international frontier (male)'] = merged_data2['Years behind international frontier (male)'].map(change)
# Re-check the distinct values of the male column after the conversion.
merged_data2['Years behind international frontier (male)'].unique()
array([32., 25., 27., 24., 23., 28., 26., 12., 11., 14., 16., 15., 13.,
18., 19., 50., 44., 43., 37., 36., 33., 35., 34., 38., 39., 40.,
41., 45., 46., 17., 20., 21., 22., 47., 31., 29., 30., 42., 49.,
48., 10., 9., 8., 7., 5., 0., 3., 4., 2., 6., 1.])
# List the distinct raw values of the female "years behind" column before cleaning.
merged_data2['Years behind international frontier (female)'].unique()
array([12, 14, 15, 16, 17, 19, 21, 22, 24, 26, 28, 30, 7, 8, 10, 11, 18,
20, 23, 25, 27, 31, 33, 34, 35, 9, 13, 29, 32, 36, 37, 38, 39, 41,
43, 6, 40, 42, 44, 45, 3, 4, -4, -5, -3, -2, 1, 5, 2, 0, -1, -6,
-12, -16, 46, 48, 49, 50, '50+', 47], dtype=object)
# Clean the female column with the same `change` converter used for the male column.
merged_data2['Years behind international frontier (female)'] = merged_data2['Years behind international frontier (female)'].map(change)
# Re-check the distinct values of the female column after the conversion.
merged_data2['Years behind international frontier (female)'].unique()
array([12., 14., 15., 16., 17., 19., 21., 22., 24., 26., 28., 30., 7.,
8., 10., 11., 18., 20., 23., 25., 27., 31., 33., 34., 35., 9.,
13., 29., 32., 36., 37., 38., 39., 41., 43., 6., 40., 42., 44.,
45., 3., 4., 0., 1., 5., 2., 46., 48., 49., 50., 47.])
We are going to drop the 'fips' column, since it is an identifier column that is not needed for the model.
# Drop the identifier column by name. The `columns=` keyword replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
merged_data2 = merged_data2.drop(columns='fips')
# Missing-value count per column, before any rows are dropped.
house_elections.isna().sum()
year 0 state 0 office 0 district 0 candidate 0 party 3620 candidatevotes 0 totalvotes 0 dtype: int64
Since the null values are in a categorical column and there are not many of them, we are going to drop the affected rows.
# Remove every row that has a missing value in any column.
house_elections = house_elections.dropna(axis=0, how='any')
# Missing-value count per column, after the rows were dropped.
house_elections.isna().sum()
year 0 state 0 office 0 district 0 candidate 0 party 0 candidatevotes 0 totalvotes 0 dtype: int64
"Imputation preserves all cases by replacing missing data with an estimated value based on other available information. Once all missing values have been imputed, the data set can then be analysed using standard techniques for complete data."
# Names of the columns that contain at least one missing value, in column order.
columns_with_null = [
    column
    for column, has_missing in merged_data2.isna().any().items()
    if has_missing
]
columns_with_null
['black_2010', 'native_2010', 'asian_2010', 'pac_isl_2010', 'percent_change_private_nonfarm_employment_2009', 'nonemployment_establishments_2009', 'firms_2007', 'black_owned_firms_2007', 'native_owned_firms_2007', 'asian_owned_firms_2007', 'pac_isl_owned_firms_2007', 'hispanic_owned_firms_2007', 'women_owned_firms_2007', 'manufacturer_shipments_2007', 'mercent_whole_sales_2007', 'sales_2007', 'sales_per_capita_2007', 'accommodation_food_service_2007', 'fed_spending_2009', 'smoking_ban_2010', 'poverty_age_under_5_2017', 'uninsured_age_under_6_2017', 'mean_work_travel_2019', 'poverty_2019', 'poverty_65_and_over_2019', 'poverty_under_18_2019', 'Black male life expectancy (years)', 'Black female life expectancy (years)', 'Closest ranked countries for male life expectancy (higher)', 'Closest ranked countries for female life expectancy (higher)']
# Number of columns that contain at least one missing value.
len(columns_with_null)
30
# Keep separate views of the object-typed (categorical) and numeric columns.
categorical_features = merged_data2.select_dtypes(include='object')
numeric_features = merged_data2.select_dtypes(include='number')
# Remove the categorical columns from the working frame, in place.
merged_data2.drop(columns=categorical_features.columns, inplace=True)
Since we are going to impute the data using the KNN algorithm, it is important to first normalize the data so that the different features are represented on the same scale, which in turn improves the overall performance of the model used to predict the missing values.
# Normalize the data
# Standardize every remaining column of merged_data2 with a single StandardScaler.
# StandardScaler does not use a target, so the stray positional `4` that was
# being passed as `y` to fit_transform is removed.
mapper = DataFrameMapper([(merged_data2.columns, StandardScaler())])
scaled_features = mapper.fit_transform(merged_data2.copy())
merged_data2_scaled = pd.DataFrame(
    scaled_features,
    index=merged_data2.index,
    columns=merged_data2.columns,
)
# Re-attach the two text columns that were set aside before scaling
# (one join instead of two; resulting column order is unchanged).
merged_data2_scaled = merged_data2_scaled.join(categorical_features[['state', 'name']])
merged_data2_scaled
| pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | pop2016 | pop2017 | age_under_5_2010 | ... | Rank (male) | Rank (female) | Male life expectancy change 1987 to 2007 (years) | Female life expectancy change 1987 to 2007 (years) | Male life expectancy change 1987 to 1997 (years) | Female life expectancy change 1987 to 1997 (years) | Male life expectancy change 1997 to 2007 (years) | Female life expectancy change 1997 to 2007 (years) | state | name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.156799 | -0.139325 | -0.138658 | -0.140431 | -0.142048 | -0.142561 | -0.143710 | -0.143615 | -0.144345 | 0.304693 | ... | 1.211598 | 1.187277 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | 0.921856 | -0.913288 | Alabama | Autauga County |
| 1 | -0.156799 | -0.139325 | -0.138658 | -0.140431 | -0.142048 | -0.142561 | -0.143710 | -0.143615 | -0.144345 | 0.304693 | ... | 1.169856 | 1.220208 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | 0.921856 | -0.913288 | Alabama | Autauga County |
| 2 | -0.156799 | -0.139325 | -0.138658 | -0.140431 | -0.142048 | -0.142561 | -0.143710 | -0.143615 | -0.144345 | 0.304693 | ... | 1.033645 | 1.160933 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | 0.921856 | -0.913288 | Alabama | Autauga County |
| 3 | -0.156799 | -0.139325 | -0.138658 | -0.140431 | -0.142048 | -0.142561 | -0.143710 | -0.143615 | -0.144345 | 0.304693 | ... | 1.102849 | 1.109341 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | 0.921856 | -0.913288 | Alabama | Autauga County |
| 4 | -0.156799 | -0.139325 | -0.138658 | -0.140431 | -0.142048 | -0.142561 | -0.143710 | -0.143615 | -0.144345 | 0.304693 | ... | 1.046827 | 1.137881 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | 0.921856 | -0.913288 | Alabama | Autauga County |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 65200 | -0.284010 | -0.291482 | -0.291347 | -0.290933 | -0.290245 | -0.289907 | -0.289546 | -0.289684 | -0.291078 | -0.482025 | ... | -0.685468 | -0.911514 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | 0.016454 | 1.354572 | Wyoming | Weston County |
| 65201 | -0.284010 | -0.291482 | -0.291347 | -0.290933 | -0.290245 | -0.289907 | -0.289546 | -0.289684 | -0.291078 | -0.482025 | ... | -0.880996 | -0.846750 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | 0.016454 | 1.354572 | Wyoming | Weston County |
| 65202 | -0.284010 | -0.291482 | -0.291347 | -0.290933 | -0.290245 | -0.289907 | -0.289546 | -0.289684 | -0.291078 | -0.482025 | ... | -0.766755 | -0.919198 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | 0.016454 | 1.354572 | Wyoming | Weston County |
| 65203 | -0.284010 | -0.291482 | -0.291347 | -0.290933 | -0.290245 | -0.289907 | -0.289546 | -0.289684 | -0.291078 | -0.482025 | ... | -0.810694 | -1.115685 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | 0.016454 | 1.354572 | Wyoming | Weston County |
| 65204 | -0.284010 | -0.291482 | -0.291347 | -0.290933 | -0.290245 | -0.289907 | -0.289546 | -0.289684 | -0.291078 | -0.482025 | ... | -0.938117 | -1.302293 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | 0.016454 | 1.354572 | Wyoming | Weston County |
65205 rows × 204 columns
# Ten columns with the most missing values, largest count first.
(
    merged_data2_scaled
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
pac_isl_owned_firms_2007 63840 native_owned_firms_2007 55356 asian_owned_firms_2007 50043 Black male life expectancy (years) 50015 black_owned_firms_2007 49266 hispanic_owned_firms_2007 49014 Black female life expectancy (years) 47108 poverty_under_18_2019 38829 poverty_65_and_over_2019 38829 poverty_2019 38829 dtype: int64
# use KNN to impute missing values
# Columns handed to the imputer: everything except the two text columns.
# Selecting them once by name avoids the positional `axis` argument of
# DataFrame.drop, which pandas 2.0 no longer accepts.
impute_cols = merged_data2_scaled.columns.drop(['state', 'name'])
imputer = KNNImputer(n_neighbors=5, weights="distance")
imputed = imputer.fit_transform(merged_data2_scaled[impute_cols])
# replace values in the dataset with new imputed values
merged_data2_scaled[impute_cols] = imputed
# Re-run the missing-value summary to confirm the imputation filled the gaps.
(
    merged_data2_scaled
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
pop2000 0 age_over_85_2019 0 uninsured_2017 0 uninsured_age_under_6_2017 0 uninsured_age_under_19_2017 0 uninsured_age_over_74_2017 0 civilian_labor_force_2017 0 employed_2017 0 unemployed_2017 0 unemployment_rate_2017 0 dtype: int64
Feature selection helps us improve the machine learning model we are going to use by eliminating redundant and irrelevant features; it is generally used to maximize relevance and minimize redundancy.
# Absolute pairwise correlations between the numeric features.
corr_matrix = merged_data2_scaled.drop(columns=['state', 'name']).corr().abs()
# Keep only the strict upper triangle so each feature pair is examined once and
# the diagonal is ignored. The builtin `bool` is used because the `np.bool`
# alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
# A column is dropped when its correlation with any earlier column exceeds 0.8.
to_drop = [column for column in upper.columns if (upper[column] > 0.8).any()]
# Drop features
merged_data2_scaled.drop(columns=to_drop, inplace=True)
# dropped columns
to_drop
['pop2010', 'pop2011', 'pop2012', 'pop2013', 'pop2014', 'pop2015', 'pop2016', 'pop2017', 'age_under_5_2017', 'age_under_18_2010', 'age_over_65_2017', 'median_age_2017', 'black_2010', 'black_2017', 'native_2017', 'asian_2017', 'hispanic_2017', 'white_not_hispanic_2010', 'white_not_hispanic_2017', 'speak_english_only_2017', 'foreign_born_2010', 'foreign_spoken_at_home_2010', 'hs_grad_2016', 'hs_grad_2017', 'some_college_2017', 'bachelors_2016', 'bachelors_2017', 'veterans_2010', 'mean_work_travel_2017', 'computer_2017', 'housing_units_2010', 'households_2010', 'households_2017', 'per_capita_income_2017', 'median_household_income_2010', 'median_household_income_2016', 'median_household_income_2017', 'private_nonfarm_establishments_2009', 'private_nonfarm_employment_2009', 'nonemployment_establishments_2009', 'firms_2007', 'black_owned_firms_2007', 'asian_owned_firms_2007', 'hispanic_owned_firms_2007', 'manufacturer_shipments_2007', 'mercent_whole_sales_2007', 'sales_2007', 'accommodation_food_service_2007', 'fed_spending_2009', 'poverty_2016', 'poverty_2017', 'poverty_age_under_5_2017', 'poverty_age_under_18_2017', 'civilian_labor_force_2007', 'employed_2007', 'unemployed_2007', 'civilian_labor_force_2008', 'employed_2008', 'unemployed_2008', 'unemployment_rate_2008', 'civilian_labor_force_2009', 'employed_2009', 'unemployed_2009', 'unemployment_rate_2009', 'civilian_labor_force_2010', 'employed_2010', 'unemployed_2010', 'unemployment_rate_2010', 'civilian_labor_force_2011', 'employed_2011', 'unemployed_2011', 'unemployment_rate_2011', 'civilian_labor_force_2012', 'employed_2012', 'unemployed_2012', 'unemployment_rate_2012', 'civilian_labor_force_2013', 'employed_2013', 'unemployed_2013', 'unemployment_rate_2013', 'civilian_labor_force_2014', 'employed_2014', 'unemployed_2014', 'unemployment_rate_2014', 'civilian_labor_force_2015', 'employed_2015', 'unemployed_2015', 'unemployment_rate_2015', 'civilian_labor_force_2016', 'employed_2016', 'unemployed_2016', 
'unemployment_rate_2016', 'uninsured_age_under_19_2017', 'civilian_labor_force_2017', 'employed_2017', 'unemployed_2017', 'unemployment_rate_2017', 'age_over_18_2019', 'age_over_65_2019', 'age_under_5_2019', 'asian_2019', 'avg_family_size_2019', 'bachelors_2019', 'black_2019', 'hispanic_2019', 'household_has_broadband_2019', 'household_has_computer_2019', 'household_has_smartphone_2019', 'households_2019', 'households_speak_asian_or_pac_isl_2019', 'households_speak_limited_english_2019', 'households_speak_spanish_2019', 'housing_mobile_homes_2019', 'housing_two_unit_structures_2019', 'hs_grad_2019', 'mean_household_income_2019', 'mean_work_travel_2019', 'median_age_2019', 'median_household_income_2019', 'median_individual_income_2019', 'median_individual_income_age_25plus_2019', 'native_2019', 'other_single_race_2019', 'per_capita_income_2019', 'persons_per_household_2019', 'pop_2019', 'poverty_2019', 'poverty_under_18_2019', 'two_plus_races_2019', 'uninsured_2019', 'uninsured_under_19_2019', 'veterans_2019', 'white_2019', 'white_not_hispanic_2019', 'Land Area', 'Years behind international frontier (male)', 'Female life expectancy (years)', 'White male life expectancy (years)', 'White female life expectancy (years)', 'Black male life expectancy (years)', 'Black female life expectancy (years)', 'Rank (male)', 'Rank (female)', 'Male life expectancy change 1997 to 2007 (years)', 'Female life expectancy change 1997 to 2007 (years)']
# Display the dataframe that remains after dropping the highly correlated columns.
merged_data2_scaled
| pop2000 | age_under_5_2010 | age_over_65_2010 | female_2010 | white_2010 | native_2010 | asian_2010 | pac_isl_2010 | pac_isl_2017 | other_single_race_2017 | ... | uninsured_under_6_2019 | Year | Male life expectancy (years) | Years behind international frontier (female) | Male life expectancy change 1987 to 2007 (years) | Female life expectancy change 1987 to 2007 (years) | Male life expectancy change 1987 to 1997 (years) | Female life expectancy change 1987 to 1997 (years) | state | name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.156799 | 0.304693 | -0.956608 | 0.573592 | -0.294014 | -0.203794 | -0.091087 | 0.091194 | 0.064482 | -0.196320 | ... | -0.648982 | -1.651446 | -1.507547 | -0.363663 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | Alabama | Autauga County |
| 1 | -0.156799 | 0.304693 | -0.956608 | 0.573592 | -0.294014 | -0.203794 | -0.091087 | 0.091194 | 0.064482 | -0.196320 | ... | -0.648982 | -1.486301 | -1.463804 | -0.109769 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | Alabama | Autauga County |
| 2 | -0.156799 | 0.304693 | -0.956608 | 0.573592 | -0.294014 | -0.203794 | -0.091087 | 0.091194 | 0.064482 | -0.196320 | ... | -0.648982 | -1.321157 | -1.245086 | -0.109769 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | Alabama | Autauga County |
| 3 | -0.156799 | 0.304693 | -0.956608 | 0.573592 | -0.294014 | -0.203794 | -0.091087 | 0.091194 | 0.064482 | -0.196320 | ... | -0.648982 | -1.156012 | -1.288829 | -0.109769 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | Alabama | Autauga County |
| 4 | -0.156799 | 0.304693 | -0.956608 | 0.573592 | -0.294014 | -0.203794 | -0.091087 | 0.091194 | 0.064482 | -0.196320 | ... | -0.648982 | -0.990867 | -1.157598 | 0.017177 | 0.537747 | -0.566236 | -0.152898 | 0.179472 | Alabama | Autauga County |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 65200 | -0.284010 | -0.482025 | -0.012875 | -1.232841 | 0.750277 | -0.064154 | -0.381053 | 0.091194 | -0.160987 | -0.533812 | ... | 0.310728 | 0.990867 | 1.029582 | -0.109769 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | Wyoming | Weston County |
| 65201 | -0.284010 | -0.482025 | -0.012875 | -1.232841 | 0.750277 | -0.064154 | -0.381053 | 0.091194 | -0.160987 | -0.533812 | ... | 0.310728 | 1.156012 | 1.204556 | 0.017177 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | Wyoming | Weston County |
| 65202 | -0.284010 | -0.482025 | -0.012875 | -1.232841 | 0.750277 | -0.064154 | -0.381053 | 0.091194 | -0.160987 | -0.533812 | ... | 0.310728 | 1.321157 | 1.204556 | 0.017177 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | Wyoming | Weston County |
| 65203 | -0.284010 | -0.482025 | -0.012875 | -1.232841 | 0.750277 | -0.064154 | -0.381053 | 0.091194 | -0.160987 | -0.533812 | ... | 0.310728 | 1.486301 | 1.292043 | -0.109769 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | Wyoming | Weston County |
| 65204 | -0.284010 | -0.482025 | -0.012875 | -1.232841 | 0.750277 | -0.064154 | -0.381053 | 0.091194 | -0.160987 | -0.533812 | ... | 0.310728 | 1.651446 | 1.423274 | -0.236716 | 0.361729 | 0.494409 | 0.598760 | -0.926569 | Wyoming | Weston County |
65205 rows × 59 columns
"Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD.
It uses the LAPACK implementation of the full SVD or a randomized truncated SVD by the method of Halko et al. 2009, depending on the shape of the input data and the number of components to extract." - scikit-learn.org
# Restrict the analysis to four states and average the rows of each county.
state_list = ['California', 'Florida', 'South Dakota', 'Wyoming']
df = (
    merged_data2_scaled
    .loc[merged_data2_scaled['state'].isin(state_list)]
    .groupby(['state', 'name'], as_index=False)
    .mean()
)
# Project all numeric features onto the first two principal components.
numeric_features = df.select_dtypes(include='number')
X = df[numeric_features.columns]
pca = PCA(n_components=2)
components = pca.fit_transform(X)
# One point per county, coloured by its state.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=df['state'],
    labels={"0": "PC1", "1": "PC2"},
    title="County Clusters by States",
)
fig.show()
Looking at the scatter plot above, we can see that there is no clear separation between the clusters. We also notice that the cluster for South Dakota is split into two sub-clusters, and that the clusters intersect with each other, so there is a clear similarity between all of them. The South Dakota cluster can be regarded as an anomaly, since it differs most from the other clusters by being split in two; the reason might be that this specific state has some outliers that the others do not have. As for the most similar states, the cluster for Wyoming is roughly centered within the cluster of Florida, even though the two states are about 1,790 miles away from each other. Distance has little to no effect here, because we did not use such a feature; rather, the features of both states must be quite similar, or in other words, one cluster is like a subgroup of the other.
# Share of the total variance captured by each of the two components.
print(f'Explained variation per principal component: {pca.explained_variance_ratio_}')
Explained variation per principal component: [0.24543326 0.20686539]
By looking at the explained variation per principal component, we can see that PC1 and PC2 together explain about 45 percent of the variability.
The importance of each feature is reflected by the magnitude of the corresponding values in the eigenvectors (higher magnitude - higher importance)
first let's take a look at the values and their corresponding feature index in the eigenvector of PC1
# Map the magnitude of each PC1 loading to the positional index of its feature in X.
pca_list1 = np.abs(pca.components_[0]).tolist()
pca_values1 = {}
for feature_idx, loading in enumerate(pca_list1):
    # enumerate supplies the position directly, avoiding a list.index() scan
    # per element. setdefault keeps the first index when two features share a
    # magnitude, which is what list.index() returned.
    pca_values1.setdefault(loading, feature_idx)
pca_values1
{0.3699630305828455: 0,
0.031169914000983345: 1,
0.12528291034665306: 2,
0.05925868902325941: 3,
0.11253740125889117: 4,
0.05347168774854461: 5,
0.4180689573778869: 6,
0.15910926569200434: 7,
0.15335571372464307: 8,
0.19121421004022837: 9,
0.19098849857287814: 10,
0.12523181280275533: 11,
0.16004150979551263: 12,
0.06414068193410842: 13,
0.06969440842161877: 14,
0.01822275224631341: 15,
0.028906823070860437: 16,
0.11822041349364688: 17,
0.11196390151179335: 18,
0.08136854253115473: 19,
0.11079103046518078: 20,
0.1520619105739115: 21,
0.1695412661562914: 22,
0.31299911095801314: 23,
0.10317102319569753: 24,
0.14566240445946366: 25,
0.12387884055094664: 26,
0.12399055296175113: 27,
0.0040448273172473525: 28,
0.03146461787002884: 29,
0.007965149212082561: 30,
0.10252820841645592: 31,
0.04754411906669883: 32,
0.21989483685899128: 33,
0.06318176599730302: 34,
0.07263450975965997: 35,
0.020386953393289938: 36,
0.0910795159630159: 37,
0.02654873062238773: 38,
0.04038757571879795: 39,
0.05996222796817054: 40,
0.08906815475876562: 41,
0.009464767236301757: 42,
0.12920932135364452: 43,
0.11055186403092941: 44,
0.14522186176662766: 45,
0.00437439180148958: 46,
0.042591421662535406: 47,
0.05757135339828772: 48,
0.056785953192417506: 49,
0.0: 50,
0.05160224691678352: 51,
0.025635803653047785: 52,
0.1547968887493836: 53,
0.13668191362778015: 54,
0.12207832460827833: 55,
0.09019617120850286: 56}
now let's sort them in descending order which means from most important to least important
# Re-key the PC1 mapping from the largest loading magnitude to the smallest.
sorted_pca_values1 = {
    magnitude: pca_values1[magnitude]
    for magnitude in sorted(pca_values1, reverse=True)
}
sorted_pca_values1
{0.4180689573778869: 6,
0.3699630305828455: 0,
0.31299911095801314: 23,
0.21989483685899128: 33,
0.19121421004022837: 9,
0.19098849857287814: 10,
0.1695412661562914: 22,
0.16004150979551263: 12,
0.15910926569200434: 7,
0.1547968887493836: 53,
0.15335571372464307: 8,
0.1520619105739115: 21,
0.14566240445946366: 25,
0.14522186176662766: 45,
0.13668191362778015: 54,
0.12920932135364452: 43,
0.12528291034665306: 2,
0.12523181280275533: 11,
0.12399055296175113: 27,
0.12387884055094664: 26,
0.12207832460827833: 55,
0.11822041349364688: 17,
0.11253740125889117: 4,
0.11196390151179335: 18,
0.11079103046518078: 20,
0.11055186403092941: 44,
0.10317102319569753: 24,
0.10252820841645592: 31,
0.0910795159630159: 37,
0.09019617120850286: 56,
0.08906815475876562: 41,
0.08136854253115473: 19,
0.07263450975965997: 35,
0.06969440842161877: 14,
0.06414068193410842: 13,
0.06318176599730302: 34,
0.05996222796817054: 40,
0.05925868902325941: 3,
0.05757135339828772: 48,
0.056785953192417506: 49,
0.05347168774854461: 5,
0.05160224691678352: 51,
0.04754411906669883: 32,
0.042591421662535406: 47,
0.04038757571879795: 39,
0.03146461787002884: 29,
0.031169914000983345: 1,
0.028906823070860437: 16,
0.02654873062238773: 38,
0.025635803653047785: 52,
0.020386953393289938: 36,
0.01822275224631341: 15,
0.009464767236301757: 42,
0.007965149212082561: 30,
0.00437439180148958: 46,
0.0040448273172473525: 28,
0.0: 50}
now we take a look at the values and their corresponding feature index in the eigenvector of PC2
# Map the magnitude of each PC2 loading to the positional index of its feature in X.
pca_list2 = np.abs(pca.components_[1]).tolist()
pca_values2 = {}
for feature_idx, loading in enumerate(pca_list2):
    # enumerate supplies the position directly, avoiding a list.index() scan
    # per element. setdefault keeps the first index when two features share a
    # magnitude, which is what list.index() returned.
    pca_values2.setdefault(loading, feature_idx)
Then, as we did before, we sort them in descending order, i.e. from most important to least important.
# Re-key the PC2 mapping from the largest loading magnitude to the smallest.
sorted_pca_values2 = {
    magnitude: pca_values2[magnitude]
    for magnitude in sorted(pca_values2, reverse=True)
}
sorted_pca_values2
{0.4162326449330077: 5,
0.299267580521829: 42,
0.2616869989692539: 47,
0.25019403364624826: 24,
0.24249097076702422: 25,
0.2297005903687704: 29,
0.21833498252920336: 1,
0.20783076305521903: 36,
0.186900974533724: 4,
0.18504564538765791: 38,
0.16997212695034358: 48,
0.16395520056692517: 40,
0.1591879163395894: 26,
0.15320566227007784: 2,
0.14237122247442516: 41,
0.13361876347992124: 37,
0.13214434572825728: 21,
0.13138217928069298: 46,
0.1305883354450262: 20,
0.11450274389491166: 54,
0.10241985389025904: 15,
0.09670547870338363: 55,
0.0954111730929797: 17,
0.0951514593819736: 53,
0.09018293318435022: 56,
0.08805880204920088: 32,
0.08641956143514624: 23,
0.07994873791412709: 8,
0.07974256260432372: 44,
0.07621680318397792: 51,
0.07206242869588084: 11,
0.07096197569536573: 18,
0.06963095713697386: 30,
0.06157596955931271: 10,
0.061090290155775266: 52,
0.05931599892752986: 43,
0.057183657309260276: 14,
0.046509500630423466: 22,
0.04253728118038031: 12,
0.041080720399442446: 49,
0.04094086667100821: 39,
0.03963806654920571: 6,
0.03757573322200981: 31,
0.03666622076770962: 34,
0.03654724598062173: 27,
0.034417726506711155: 3,
0.031192166955416645: 9,
0.026347385450704343: 33,
0.023613824763336948: 16,
0.019681918246060356: 35,
0.018878513271045203: 0,
0.015771770924882213: 7,
0.014394900068712795: 28,
0.001087520048745067: 13,
0.0010858416414943012: 45,
0.0005488320381748891: 19,
0.0: 50}
"pca.components_ is the set of all eigenvectors (aka loadings) for your projection space (one eigenvector for each principal component)."
Now that we have a sorted list of feature importances, we take the most important features from PC1 and PC2 and apply PCA to them; we then do the same with the least important features.
# filter dataframe by best features
# The positions index the numeric columns (state/name removed): 6 and 0 have the
# largest PC1 loadings, 5 and 42 the largest PC2 loadings in the sorted output.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
best_cols = df.drop(columns=['state', 'name']).iloc[:, [0, 5, 6, 42]].columns
best_cols
Index(['pop2000', 'native_2010', 'asian_2010', 'households_speak_other_2019'], dtype='object')
# Rebuild the per-county averages for the four selected states.
state_list = ['California', 'Florida', 'South Dakota', 'Wyoming']
df = merged_data2_scaled[merged_data2_scaled['state'].isin(state_list)]
df = df.groupby(['state', 'name'], as_index=False).mean()
# Fit a two-component PCA on the columns listed in best_cols only.
X = df[best_cols]
pca = PCA(n_components=2)
components = pca.fit_transform(X)
# One point per county, coloured by its state.
axis_labels = {"0": "PC1", "1": "PC2"}
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=df['state'],
    labels=axis_labels,
    title="County Clusters by States",
)
fig.show()
Using the best features we found, the clusters now have a clearer shape: roughly linear, except for Wyoming, which looks more like a circular cluster. Surprisingly, using the most effective features did not give a better separation between the clusters, although their shapes became clearer. A possible reason is that, even though these features got the highest scores, those values were still fairly low, which might explain why we did not get the separation we were looking for. Alternatively, the clusters may simply have no real separation between them, since they are too similar and have many things in common.
# Report the share of variance captured by each of the two components.
print(f'Explained variation per principal component: {pca.explained_variance_ratio_}')
Explained variation per principal component: [0.47482906 0.34639272]
We can see that PC1 and PC2 together explain about 82% of the variance in the data. This is no surprise, since the number of columns was reduced from merely 4 to 2, which might explain the increase in explained variance.
# Pick the lowest-scoring features by their position among the columns that
# remain once the 'state' and 'name' identifier columns are removed.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` is
# deprecated and rejected by pandas >= 2.0.
worst_cols = df.drop(['state', 'name'], axis=1).iloc[:, [19, 28, 50]].columns
worst_cols
Index(['mean_work_travel_2010',
'percent_change_private_nonfarm_employment_2009', 'Year'],
dtype='object')
# Rebuild the four-state frame: filter the scaled data, then average the rows
# of every (state, county name) pair.
state_list = ['California', 'Florida', 'South Dakota', 'Wyoming']
in_selected_states = merged_data2_scaled['state'].isin(state_list)
df = (
    merged_data2_scaled[in_selected_states]
    .groupby(['state', 'name'], as_index=False)
    .mean()
)

# Reduce the lowest-scoring features to two principal components.
X = df[worst_cols]
pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Scatter the two components, colouring every point by its state.
axis_labels = {"0": "PC1", "1": "PC2"}
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=df['state'],
    labels=axis_labels,
    title="County Clusters by States",
)
fig.show()
This time, applying PCA to the least effective features that we previously found, we see that there is no separation between the clusters; they look mushed together and very similar to each other, unlike the results we got from using the most effective features. This means that these features are quite similar across the four clusters, which explains the similarity and lack of separation between them.
# Report the share of variance captured by each of the two components.
print(f'Explained variation per principal component: {pca.explained_variance_ratio_}')
Explained variation per principal component: [0.60102752 0.39897248]
to measure the goodness of separation we're going to use the Silhouette Coefficient. "Silhouette Coefficient or silhouette score is a metric used to calculate the goodness of a clustering technique. Its value ranges from -1 to 1.
1: Means clusters are well apart from each other and clearly distinguished.
0: Means clusters are indifferent, or we can say that the distance between clusters is not significant.
-1: Means clusters are assigned in the wrong way."
# Cluster the counties on the best-scoring features and score the result.
X = df[best_cols]
y = df['state']

# KMeans with four clusters and a fixed seed; fitting populates km.labels_.
km = KMeans(n_clusters=4, random_state=42)
km.fit_predict(X)

# Silhouette coefficient of the fitted cluster labels (Euclidean distance).
score = silhouette_score(X, km.labels_, metric='euclidean')
print(f'Silhouetter Score: {score:.3f}')
Silhouetter Score: 0.787
# Cluster the counties on the lowest-scoring features and score the result.
X = df[worst_cols]
y = df['state']

# KMeans with four clusters and a fixed seed; fitting populates km.labels_.
km = KMeans(n_clusters=4, random_state=42)
km.fit_predict(X)

# Silhouette coefficient of the fitted cluster labels (Euclidean distance).
score = silhouette_score(X, km.labels_, metric='euclidean')
print(f'Silhouetter Score: {score:.3f}')
Silhouetter Score: 0.384
Isomap Embedding. Non-linear dimensionality reduction through Isometric Mapping.
# Restrict the scaled data to the four states under study and average the
# rows of every (state, county name) pair.
state_list = ['California', 'Florida', 'South Dakota', 'Wyoming']
df = merged_data2_scaled[merged_data2_scaled['state'].isin(state_list)]
df = df.groupby(['state', 'name'], as_index=False).mean()

# Embed every numeric column into two dimensions with Isomap.
numeric_features = df.select_dtypes(include='number')
X = df[numeric_features.columns]
embedding = Isomap(n_components=2)
X_transformed = embedding.fit_transform(X)

# Scatter the embedding, colouring every point by its state.
# The axes are Isomap embedding dimensions, not principal components, so they
# are labelled accordingly instead of "PC1"/"PC2".
fig = px.scatter(
    X_transformed,
    x=0,
    y=1,
    color=df['state'],
    labels={
        "0": "Isomap 1",
        "1": "Isomap 2",
    },
    title="County Clusters by States",
)
fig.show()
Unlike PCA, Isomap gives a clearer separation between the clusters while using all the features. The South Dakota and California clusters are clearly separated, and so are Wyoming and Florida. As with PCA, there is an anomaly here: the Florida cluster appears visually as two distinct clusters instead of one. We do not notice any similarity between the clusters, but some clusters look like sub-clusters of others; for example, the Florida cluster, which is split in two, has one half inside the California cluster.
To find the most and least effective features for separating the four states, we fit a logistic regression, sort its coefficients and show them visually. Based on that, we take the most and least important features, apply Isomap to these specific features again, and examine the results.
# Logistic regression used as a feature-importance probe.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot

# Fit a default logistic regression that predicts the state from X.
model = LogisticRegression()
model.fit(X, df['state'])

# Take the first row of the coefficient matrix as the importance scores.
# NOTE(review): with four states coef_ presumably holds one row per class, so
# row 0 would describe only the first class -- confirm this is intended.
importance = model.coef_[0]

# Map feature position -> coefficient.
feat_score = dict(enumerate(importance))

# Bar chart with one bar per feature position.
pyplot.bar(list(range(len(importance))), importance)
pyplot.show()

# Show the features ordered from the largest coefficient to the smallest.
dict(sorted(feat_score.items(), key=lambda item: item[1], reverse=True))
{37: 0.7601477448524282,
23: 0.6649148016158248,
10: 0.6102023739133594,
34: 0.432604931534568,
16: 0.33388404840454267,
11: 0.3266526438635486,
6: 0.3157828307154359,
12: 0.28167915532322674,
9: 0.2702311727918338,
8: 0.2576433497567219,
47: 0.22413362340385196,
48: 0.20901851008875003,
19: 0.2053284954474627,
7: 0.19407845647192543,
0: 0.16419006916996712,
13: 0.12875212131681188,
2: 0.12010996808797697,
51: 0.09530764299750079,
45: 0.09133880604224999,
55: 0.08797891672800315,
41: 0.07878270985647758,
56: 0.05345339508218621,
53: 0.050193752852760044,
25: 0.0450820454347938,
31: 0.034189770191318226,
46: 0.0296638379315065,
5: 0.025701793404967203,
36: 0.017946071537840454,
44: 0.008216609363549699,
24: 0.007197245211149242,
52: 0.0012062215537009418,
50: 0.0,
35: -0.0004893800485823716,
29: -0.013364432187920857,
20: -0.02372655307270263,
43: -0.03335935806948628,
49: -0.040581606473141726,
26: -0.04302885514705587,
33: -0.061570872226172586,
14: -0.0625957899900142,
54: -0.09449095872040127,
3: -0.09905588149635984,
4: -0.10000385888130958,
42: -0.10142292912010131,
22: -0.10668966501167992,
32: -0.1151896645407766,
40: -0.12275799418514033,
17: -0.14366154233323206,
15: -0.1523124688370092,
21: -0.15891876759132334,
39: -0.16957208035887555,
28: -0.1757923579774233,
1: -0.1786958674102034,
30: -0.18442338660127844,
27: -0.21721736398000957,
18: -0.2608138882428683,
38: -0.3408574659810999}
# Names of the five features with the largest logistic-regression
# coefficients, looked up by position among the non-identifier columns.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` is
# deprecated and rejected by pandas >= 2.0.
best_cols_iso = df.drop(['state', 'name'], axis=1).iloc[:, [37, 23, 10, 34, 16]].columns

# Rebuild the four-state frame: filter the scaled data, then average the rows
# of every (state, county name) pair.
state_list = ['California', 'Florida', 'South Dakota', 'Wyoming']
df = merged_data2_scaled[merged_data2_scaled['state'].isin(state_list)]
df = df.groupby(['state', 'name'], as_index=False).mean()
numeric_features = df.select_dtypes(include='number')

# Embed the selected features into two dimensions with Isomap.
X = df[best_cols_iso]
embedding = Isomap(n_components=2)
X_transformed1 = embedding.fit_transform(X)

# Scatter the embedding, colouring every point by its state.
# The axes are Isomap embedding dimensions, not principal components, so they
# are labelled accordingly instead of "PC1"/"PC2".
fig = px.scatter(
    X_transformed1,
    x=0,
    y=1,
    color=df['state'],
    labels={
        "0": "Isomap 1",
        "1": "Isomap 2",
    },
    title="County Clusters by States",
)
fig.show()
Compared to the results so far, these clusters are the most clearly separated, which means that the features we found are indeed the most effective for separating the four states. The California cluster is the most separated from the others and is the biggest of them. The other states (Florida, South Dakota and Wyoming) are similar in size, are also clearly separated, and are somewhat similar to each other; moreover, we notice an intersection between the two clusters for Florida and South Dakota.
# Names of the three features at the bottom of the coefficient ranking,
# looked up by position among the non-identifier columns.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` is
# deprecated and rejected by pandas >= 2.0.
# NOTE(review): positions 18, 27 and 38 carry the most negative coefficients
# in the ranking, not the smallest magnitudes -- confirm they are really the
# least informative features.
worst_cols_iso = df.drop(['state', 'name'], axis=1).iloc[:, [18, 27, 38]].columns

# Rebuild the four-state frame: filter the scaled data, then average the rows
# of every (state, county name) pair.
state_list = ['California', 'Florida', 'South Dakota', 'Wyoming']
df = merged_data2_scaled[merged_data2_scaled['state'].isin(state_list)]
df = df.groupby(['state', 'name'], as_index=False).mean()
numeric_features = df.select_dtypes(include='number')

# Embed the selected features into two dimensions with Isomap.
X = df[worst_cols_iso]
embedding = Isomap(n_components=2)
X_transformed2 = embedding.fit_transform(X)

# Scatter the embedding, colouring every point by its state.
# The axes are Isomap embedding dimensions, not principal components, so they
# are labelled accordingly instead of "PC1"/"PC2".
fig = px.scatter(
    X_transformed2,
    x=0,
    y=1,
    color=df['state'],
    labels={
        "0": "Isomap 1",
        "1": "Isomap 2",
    },
    title="County Clusters by States",
)
fig.show()
Of all the results so far, this is the worst separation: there are no clear shapes and all the points are mixed together with no separation at all, which means that we did indeed find the least effective features for separating the clusters.
# Cluster the Isomap embedding of the best features and score the result.
# KMeans uses four clusters and the notebook-wide seed; fitting populates
# km.labels_.
km = KMeans(n_clusters=4, random_state=RSEED)
km.fit_predict(X_transformed1)

# Silhouette coefficient of the fitted cluster labels (Euclidean distance).
score = silhouette_score(X_transformed1, km.labels_, metric='euclidean')
print(f'Silhouetter Score: {score:.3f}')
Silhouetter Score: 0.580
# Cluster the Isomap embedding of the worst features and score the result.
# KMeans uses four clusters and the notebook-wide seed; fitting populates
# km.labels_.
km = KMeans(n_clusters=4, random_state=RSEED)
km.fit_predict(X_transformed2)

# Silhouette coefficient of the fitted cluster labels (Euclidean distance).
score = silhouette_score(X_transformed2, km.labels_, metric='euclidean')
print(f'Silhouetter Score: {score:.3f}')
Silhouetter Score: 0.574
To create as accurate a measure as possible of voter turnout percentage by state for the years 2010 and 2012, we need the following features: year, total votes and VAP (voting-age population). First, we filter the dataframe to keep only the data for 2010 and 2012, then we group the candidate votes by state and year to get the total votes. Next, we calculate the VAP for 2010 using the columns pop2010 and age_under_18_2010. For 2012, since we are missing the age-over-18 column, we estimate how this value increases over the years and calculate it based on that. Once we have the VAP for both 2010 and 2012, we merge them with the filtered house elections dataframe and divide total votes by VAP, which gives us the voter turnout percentage.
# Count the missing values in each column of the House elections data.
house_elections.isna().sum()
year 0 state 0 office 0 district 0 candidate 0 party 0 candidatevotes 0 totalvotes 0 dtype: int64
# Keep only the House election rows for 2010 and 2012.
year_list = [2010, 2012]
is_target_year = house_elections['year'].isin(year_list)
filtered_df = house_elections[is_target_year]
filtered_df
| year | state | office | district | candidate | party | candidatevotes | totalvotes | |
|---|---|---|---|---|---|---|---|---|
| 22553 | 2010 | ALABAMA | US HOUSE | 1 | DAVID WALTER | CONSTITUTION | 26357 | 156281 |
| 22554 | 2010 | ALABAMA | US HOUSE | 1 | JO BONNER | REPUBLICAN | 129063 | 156281 |
| 22556 | 2010 | ALABAMA | US HOUSE | 2 | BOBBY BRIGHT | DEMOCRAT | 106865 | 219028 |
| 22557 | 2010 | ALABAMA | US HOUSE | 2 | MARTHA ROBY | REPUBLICAN | 111645 | 219028 |
| 22559 | 2010 | ALABAMA | US HOUSE | 3 | MIKE ROGERS | REPUBLICAN | 117736 | 198139 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25473 | 2012 | WYOMING | US HOUSE | 0 | CHRIS HENRICHSEN | DEMOCRAT | 57573 | 250700 |
| 25474 | 2012 | WYOMING | US HOUSE | 0 | CYNTHIA M LUMMIS | REPUBLICAN | 166452 | 250700 |
| 25475 | 2012 | WYOMING | US HOUSE | 0 | DANIEL CLYDE CUMMINGS | CONSTITUTION | 4963 | 250700 |
| 25476 | 2012 | WYOMING | US HOUSE | 0 | DON WILLS | COUNTRY PARTY | 3775 | 250700 |
| 25478 | 2012 | WYOMING | US HOUSE | 0 | RICHARD P BRUBAKER | LIBERTARIAN | 8442 | 250700 |
2560 rows × 8 columns
# Total votes: add up the candidate votes within each (year, state) pair.
votes_by_year_state = filtered_df.groupby(['year', 'state'], as_index=False)
filtered_df = votes_by_year_state['candidatevotes'].sum()
filtered_df
| year | state | candidatevotes | |
|---|---|---|---|
| 0 | 2010 | ALABAMA | 1359759 |
| 1 | 2010 | ALASKA | 252990 |
| 2 | 2010 | ARIZONA | 1698135 |
| 3 | 2010 | ARKANSAS | 773866 |
| 4 | 2010 | CALIFORNIA | 9644560 |
| ... | ... | ... | ... |
| 95 | 2012 | VIRGINIA | 3733561 |
| 96 | 2012 | WASHINGTON | 3006266 |
| 97 | 2012 | WEST VIRGINIA | 641354 |
| 98 | 2012 | WISCONSIN | 2862341 |
| 99 | 2012 | WYOMING | 241205 |
100 rows × 3 columns
# The summed candidate votes are the state's total votes; name the column so.
filtered_df = filtered_df.rename(columns={'candidatevotes': 'totalVotes'})
filtered_df
| year | state | totalVotes | |
|---|---|---|---|
| 0 | 2010 | ALABAMA | 1359759 |
| 1 | 2010 | ALASKA | 252990 |
| 2 | 2010 | ARIZONA | 1698135 |
| 3 | 2010 | ARKANSAS | 773866 |
| 4 | 2010 | CALIFORNIA | 9644560 |
| ... | ... | ... | ... |
| 95 | 2012 | VIRGINIA | 3733561 |
| 96 | 2012 | WASHINGTON | 3006266 |
| 97 | 2012 | WEST VIRGINIA | 641354 |
| 98 | 2012 | WISCONSIN | 2862341 |
| 99 | 2012 | WYOMING | 241205 |
100 rows × 3 columns
# Peek at five reproducibly sampled counties.
county_complete.sample(n=5, random_state=RSEED)
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2830 | 51021 | Virginia | Bland County | 6871.0 | 6824 | 6776.0 | 6695.0 | 6675.0 | 6587.0 | 6535.0 | ... | NaN | 1.0 | 5.4 | 7.5 | 0.0 | 8.1 | 10.9 | 6.8 | 95.0 | 93.9 |
| 2087 | 39089 | Ohio | Licking County | 145491.0 | 166492 | 167149.0 | 167568.0 | 168401.0 | 169295.0 | 170425.0 | ... | 15.1 | 2.9 | 3.9 | 6.4 | 0.1 | 4.9 | 3.5 | 9.5 | 91.4 | 90.1 |
| 25 | 01051 | Alabama | Elmore County | 65874.0 | 79303 | 80006.0 | 80220.0 | 80555.0 | 80562.0 | 80912.0 | ... | 15.2 | 1.7 | 4.0 | 7.4 | 0.0 | 3.1 | 2.3 | 10.6 | 75.3 | 73.2 |
| 1122 | 22019 | Louisiana | Calcasieu Parish | 183577.0 | 192768 | 193813.0 | 194506.0 | 195506.0 | 196641.0 | 198542.0 | ... | 25.3 | 2.4 | 4.9 | 9.2 | 0.1 | 2.3 | 1.8 | 7.9 | 70.0 | 67.4 |
| 108 | 04023 | Arizona | Santa Cruz County | 38381.0 | 47420 | 47615.0 | 47292.0 | 46845.0 | 46468.0 | 46191.0 | ... | 31.5 | 1.3 | 7.2 | 10.2 | 2.7 | 4.3 | 3.8 | 5.1 | 85.7 | 14.9 |
5 rows × 188 columns
# True if any county is missing its 2010 population.
county_complete['pop2010'].isna().to_numpy().any()
False
# True if any county is missing its 2010 under-18 percentage.
county_complete['age_under_18_2010'].isna().to_numpy().any()
False
# Voting-age population (VAP) for 2010: the 2010 population multiplied by the
# fraction of residents that is not under 18, rounded down.
adult_fraction_2010 = (100 - county_complete['age_under_18_2010']) / 100
county_complete['VAP2010'] = np.floor(county_complete['pop2010'] * adult_fraction_2010)
county_complete['VAP2010']
0 39945.0
1 140344.0
2 21443.0
3 17713.0
4 43220.0
...
3137 31934.0
3138 17226.0
3139 14740.0
3140 6357.0
3141 5636.0
Name: VAP2010, Length: 3142, dtype: float64
# True if any county is missing its 2019 over-18 percentage.
county_complete['age_over_18_2019'].isna().to_numpy().any()
False
# Voting-age population (VAP) for 2012.
# The adult percentage is interpolated linearly: start from the 2010 value
# and add two years' worth of the change towards the 2019 value, then apply
# the resulting fraction to the 2012 population and round down.
# NOTE(review): the change is divided by 10 although the column names suggest
# a 9-year gap between 2010 and 2019 -- confirm this is intentional.
adult_pct_2010 = 100 - county_complete['age_under_18_2010']
adult_pct_2019 = county_complete['age_over_18_2019']
two_year_change = (adult_pct_2019 - adult_pct_2010) / 10 * 2
adult_pct_2012 = two_year_change + 100 - county_complete['age_under_18_2010']
county_complete['VAP2012'] = np.floor(adult_pct_2012 / 100 * county_complete['pop2012'])
county_complete['VAP2012']
0 40536.0
1 146831.0
2 21278.0
3 17609.0
4 43587.0
...
3137 32889.0
3138 17562.0
3139 14699.0
3140 6288.0
3141 5543.0
Name: VAP2012, Length: 3142, dtype: float64
# Aggregate the county-level VAP estimates to one row per state.
# The columns are selected with a list (double brackets): selecting several
# groupby columns with a bare tuple of names is deprecated and raises in
# pandas >= 2.0.
county_complete2 = county_complete.groupby('state', as_index=False)[['VAP2010', 'VAP2012']].sum()

# Drop District of Columbia, order the rows by state and renumber them 0..n-1.
# NOTE(review): DC is presumably removed because the election data has no
# rows for it -- confirm against house_elections.
# `reset_index(drop=True)` replaces the former reset_index() followed by the
# positional-axis `drop('index', 1)`, which pandas >= 2.0 rejects.
county_complete2 = (
    county_complete2[county_complete2['state'] != 'District of Columbia']
    .sort_values(by='state')
    .reset_index(drop=True)
)
county_complete2
| state | VAP2010 | VAP2012 | |
|---|---|---|---|
| 0 | Alabama | 3646981.0 | 3684787.0 |
| 1 | Alaska | 522751.0 | 535397.0 |
| 2 | Arizona | 4763130.0 | 4905918.0 |
| 3 | Arkansas | 2204313.0 | 2234791.0 |
| 4 | California | 27955194.0 | 28680763.0 |
| 5 | Colorado | 3803001.0 | 3941163.0 |
| 6 | Connecticut | 2756819.0 | 2789596.0 |
| 7 | Delaware | 692372.0 | 709891.0 |
| 8 | Florida | 14797963.0 | 15270158.0 |
| 9 | Georgia | 7196177.0 | 7394822.0 |
| 10 | Hawaii | 1056446.0 | 1084314.0 |
| 11 | Idaho | 1138528.0 | 1163512.0 |
| 12 | Illinois | 9702168.0 | 9783173.0 |
| 13 | Indiana | 4874896.0 | 4929657.0 |
| 14 | Iowa | 2318473.0 | 2343874.0 |
| 15 | Kansas | 2126260.0 | 2156209.0 |
| 16 | Kentucky | 3315647.0 | 3357281.0 |
| 17 | Louisiana | 3415094.0 | 3476883.0 |
| 18 | Maine | 1053779.0 | 1058243.0 |
| 19 | Maryland | 4419968.0 | 4523759.0 |
| 20 | Massachusetts | 5129171.0 | 5239351.0 |
| 21 | Michigan | 7539072.0 | 7578532.0 |
| 22 | Minnesota | 4019316.0 | 4085045.0 |
| 23 | Mississippi | 2211757.0 | 2232845.0 |
| 24 | Missouri | 4563510.0 | 4603905.0 |
| 25 | Montana | 765792.0 | 778298.0 |
| 26 | Nebraska | 1366743.0 | 1389460.0 |
| 27 | Nevada | 2036187.0 | 2084445.0 |
| 28 | New Hampshire | 1029354.0 | 1039460.0 |
| 29 | New Jersey | 6725870.0 | 6820431.0 |
| 30 | New Mexico | 1540481.0 | 1566254.0 |
| 31 | New York | 15052653.0 | 15297720.0 |
| 32 | North Carolina | 7253813.0 | 7449864.0 |
| 33 | North Dakota | 522655.0 | 543720.0 |
| 34 | Ohio | 8806302.0 | 8844420.0 |
| 35 | Oklahoma | 2821701.0 | 2873141.0 |
| 36 | Oregon | 2965220.0 | 3026055.0 |
| 37 | Pennsylvania | 9909668.0 | 9990642.0 |
| 38 | Rhode Island | 828361.0 | 832044.0 |
| 39 | South Carolina | 3545098.0 | 3629956.0 |
| 40 | South Dakota | 611357.0 | 625545.0 |
| 41 | Tennessee | 4849154.0 | 4943511.0 |
| 42 | Texas | 18283803.0 | 19026859.0 |
| 43 | Utah | 1893012.0 | 1964476.0 |
| 44 | Vermont | 496428.0 | 498787.0 |
| 45 | Virginia | 6142207.0 | 6307975.0 |
| 46 | Washington | 5143903.0 | 5288778.0 |
| 47 | West Virginia | 1465505.0 | 1469645.0 |
| 48 | Wisconsin | 4348246.0 | 4390494.0 |
| 49 | Wyoming | 428209.0 | 438695.0 |
# Rows of the per-state vote totals that belong to the 2010 election.
is_2010 = filtered_df['year'] == 2010
df_2010 = filtered_df.loc[is_2010]
df_2010
| year | state | totalVotes | |
|---|---|---|---|
| 0 | 2010 | ALABAMA | 1359759 |
| 1 | 2010 | ALASKA | 252990 |
| 2 | 2010 | ARIZONA | 1698135 |
| 3 | 2010 | ARKANSAS | 773866 |
| 4 | 2010 | CALIFORNIA | 9644560 |
| 5 | 2010 | COLORADO | 1763106 |
| 6 | 2010 | CONNECTICUT | 1138116 |
| 7 | 2010 | DELAWARE | 305636 |
| 8 | 2010 | FLORIDA | 5116018 |
| 9 | 2010 | GEORGIA | 2468489 |
| 10 | 2010 | HAWAII | 360121 |
| 11 | 2010 | IDAHO | 447144 |
| 12 | 2010 | ILLINOIS | 3696108 |
| 13 | 2010 | INDIANA | 1747640 |
| 14 | 2010 | IOWA | 1094452 |
| 15 | 2010 | KANSAS | 835529 |
| 16 | 2010 | KENTUCKY | 1354051 |
| 17 | 2010 | LOUISIANA | 1035948 |
| 18 | 2010 | MAINE | 564326 |
| 19 | 2010 | MARYLAND | 1823638 |
| 20 | 2010 | MASSACHUSETTS | 2219813 |
| 21 | 2010 | MICHIGAN | 3194857 |
| 22 | 2010 | MINNESOTA | 2089062 |
| 23 | 2010 | MISSISSIPPI | 788549 |
| 24 | 2010 | MISSOURI | 1919791 |
| 25 | 2010 | MONTANA | 360341 |
| 26 | 2010 | NEBRASKA | 465510 |
| 27 | 2010 | NEVADA | 702788 |
| 28 | 2010 | NEW HAMPSHIRE | 449787 |
| 29 | 2010 | NEW JERSEY | 2121584 |
| 30 | 2010 | NEW MEXICO | 596651 |
| 31 | 2010 | NEW YORK | 4484408 |
| 32 | 2010 | NORTH CAROLINA | 2662110 |
| 33 | 2010 | NORTH DAKOTA | 236344 |
| 34 | 2010 | OHIO | 3825014 |
| 35 | 2010 | OKLAHOMA | 792980 |
| 36 | 2010 | OREGON | 1427027 |
| 37 | 2010 | PENNSYLVANIA | 3956401 |
| 38 | 2010 | RHODE ISLAND | 335004 |
| 39 | 2010 | SOUTH CAROLINA | 1339410 |
| 40 | 2010 | SOUTH DAKOTA | 319426 |
| 41 | 2010 | TENNESSEE | 1559120 |
| 42 | 2010 | TEXAS | 4744189 |
| 43 | 2010 | UTAH | 640495 |
| 44 | 2010 | VERMONT | 238335 |
| 45 | 2010 | VIRGINIA | 2184271 |
| 46 | 2010 | WASHINGTON | 2479409 |
| 47 | 2010 | WEST VIRGINIA | 514373 |
| 48 | 2010 | WISCONSIN | 2138775 |
| 49 | 2010 | WYOMING | 186682 |
# Order the 2010 totals alphabetically by state.
df_2010 = df_2010.sort_values(by='state')
df_2010
| year | state | totalVotes | |
|---|---|---|---|
| 0 | 2010 | ALABAMA | 1359759 |
| 1 | 2010 | ALASKA | 252990 |
| 2 | 2010 | ARIZONA | 1698135 |
| 3 | 2010 | ARKANSAS | 773866 |
| 4 | 2010 | CALIFORNIA | 9644560 |
| 5 | 2010 | COLORADO | 1763106 |
| 6 | 2010 | CONNECTICUT | 1138116 |
| 7 | 2010 | DELAWARE | 305636 |
| 8 | 2010 | FLORIDA | 5116018 |
| 9 | 2010 | GEORGIA | 2468489 |
| 10 | 2010 | HAWAII | 360121 |
| 11 | 2010 | IDAHO | 447144 |
| 12 | 2010 | ILLINOIS | 3696108 |
| 13 | 2010 | INDIANA | 1747640 |
| 14 | 2010 | IOWA | 1094452 |
| 15 | 2010 | KANSAS | 835529 |
| 16 | 2010 | KENTUCKY | 1354051 |
| 17 | 2010 | LOUISIANA | 1035948 |
| 18 | 2010 | MAINE | 564326 |
| 19 | 2010 | MARYLAND | 1823638 |
| 20 | 2010 | MASSACHUSETTS | 2219813 |
| 21 | 2010 | MICHIGAN | 3194857 |
| 22 | 2010 | MINNESOTA | 2089062 |
| 23 | 2010 | MISSISSIPPI | 788549 |
| 24 | 2010 | MISSOURI | 1919791 |
| 25 | 2010 | MONTANA | 360341 |
| 26 | 2010 | NEBRASKA | 465510 |
| 27 | 2010 | NEVADA | 702788 |
| 28 | 2010 | NEW HAMPSHIRE | 449787 |
| 29 | 2010 | NEW JERSEY | 2121584 |
| 30 | 2010 | NEW MEXICO | 596651 |
| 31 | 2010 | NEW YORK | 4484408 |
| 32 | 2010 | NORTH CAROLINA | 2662110 |
| 33 | 2010 | NORTH DAKOTA | 236344 |
| 34 | 2010 | OHIO | 3825014 |
| 35 | 2010 | OKLAHOMA | 792980 |
| 36 | 2010 | OREGON | 1427027 |
| 37 | 2010 | PENNSYLVANIA | 3956401 |
| 38 | 2010 | RHODE ISLAND | 335004 |
| 39 | 2010 | SOUTH CAROLINA | 1339410 |
| 40 | 2010 | SOUTH DAKOTA | 319426 |
| 41 | 2010 | TENNESSEE | 1559120 |
| 42 | 2010 | TEXAS | 4744189 |
| 43 | 2010 | UTAH | 640495 |
| 44 | 2010 | VERMONT | 238335 |
| 45 | 2010 | VIRGINIA | 2184271 |
| 46 | 2010 | WASHINGTON | 2479409 |
| 47 | 2010 | WEST VIRGINIA | 514373 |
| 48 | 2010 | WISCONSIN | 2138775 |
| 49 | 2010 | WYOMING | 186682 |
# Voter turnout for 2010 = total votes / voting-age population.
# The VAP is looked up by state name rather than by row position, so the
# result no longer depends on both frames sharing the same index and order.
# State names are upper-cased to match the spelling used in the election data.
vap_2010_by_state = county_complete2.set_index(county_complete2['state'].str.upper())['VAP2010']

# `assign` returns a new frame, which avoids writing into a slice of
# filtered_df (SettingWithCopyWarning).
df_2010 = df_2010.assign(
    voter_turnout=df_2010['totalVotes'] / df_2010['state'].map(vap_2010_by_state)
)
df_2010
| year | state | totalVotes | voter_turnout | |
|---|---|---|---|---|
| 0 | 2010 | ALABAMA | 1359759 | 0.372845 |
| 1 | 2010 | ALASKA | 252990 | 0.483959 |
| 2 | 2010 | ARIZONA | 1698135 | 0.356517 |
| 3 | 2010 | ARKANSAS | 773866 | 0.351069 |
| 4 | 2010 | CALIFORNIA | 9644560 | 0.345001 |
| 5 | 2010 | COLORADO | 1763106 | 0.463609 |
| 6 | 2010 | CONNECTICUT | 1138116 | 0.412837 |
| 7 | 2010 | DELAWARE | 305636 | 0.441433 |
| 8 | 2010 | FLORIDA | 5116018 | 0.345724 |
| 9 | 2010 | GEORGIA | 2468489 | 0.343028 |
| 10 | 2010 | HAWAII | 360121 | 0.340880 |
| 11 | 2010 | IDAHO | 447144 | 0.392739 |
| 12 | 2010 | ILLINOIS | 3696108 | 0.380957 |
| 13 | 2010 | INDIANA | 1747640 | 0.358498 |
| 14 | 2010 | IOWA | 1094452 | 0.472057 |
| 15 | 2010 | KANSAS | 835529 | 0.392957 |
| 16 | 2010 | KENTUCKY | 1354051 | 0.408382 |
| 17 | 2010 | LOUISIANA | 1035948 | 0.303344 |
| 18 | 2010 | MAINE | 564326 | 0.535526 |
| 19 | 2010 | MARYLAND | 1823638 | 0.412591 |
| 20 | 2010 | MASSACHUSETTS | 2219813 | 0.432782 |
| 21 | 2010 | MICHIGAN | 3194857 | 0.423773 |
| 22 | 2010 | MINNESOTA | 2089062 | 0.519756 |
| 23 | 2010 | MISSISSIPPI | 788549 | 0.356526 |
| 24 | 2010 | MISSOURI | 1919791 | 0.420683 |
| 25 | 2010 | MONTANA | 360341 | 0.470547 |
| 26 | 2010 | NEBRASKA | 465510 | 0.340598 |
| 27 | 2010 | NEVADA | 702788 | 0.345149 |
| 28 | 2010 | NEW HAMPSHIRE | 449787 | 0.436960 |
| 29 | 2010 | NEW JERSEY | 2121584 | 0.315436 |
| 30 | 2010 | NEW MEXICO | 596651 | 0.387315 |
| 31 | 2010 | NEW YORK | 4484408 | 0.297915 |
| 32 | 2010 | NORTH CAROLINA | 2662110 | 0.366995 |
| 33 | 2010 | NORTH DAKOTA | 236344 | 0.452199 |
| 34 | 2010 | OHIO | 3825014 | 0.434350 |
| 35 | 2010 | OKLAHOMA | 792980 | 0.281029 |
| 36 | 2010 | OREGON | 1427027 | 0.481255 |
| 37 | 2010 | PENNSYLVANIA | 3956401 | 0.399247 |
| 38 | 2010 | RHODE ISLAND | 335004 | 0.404418 |
| 39 | 2010 | SOUTH CAROLINA | 1339410 | 0.377820 |
| 40 | 2010 | SOUTH DAKOTA | 319426 | 0.522487 |
| 41 | 2010 | TENNESSEE | 1559120 | 0.321524 |
| 42 | 2010 | TEXAS | 4744189 | 0.259475 |
| 43 | 2010 | UTAH | 640495 | 0.338347 |
| 44 | 2010 | VERMONT | 238335 | 0.480100 |
| 45 | 2010 | VIRGINIA | 2184271 | 0.355617 |
| 46 | 2010 | WASHINGTON | 2479409 | 0.482009 |
| 47 | 2010 | WEST VIRGINIA | 514373 | 0.350987 |
| 48 | 2010 | WISCONSIN | 2138775 | 0.491871 |
| 49 | 2010 | WYOMING | 186682 | 0.435960 |
# Rows of the per-state vote totals that belong to the 2012 election.
is_2012 = filtered_df['year'] == 2012
df_2012 = filtered_df.loc[is_2012]
df_2012
| year | state | totalVotes | |
|---|---|---|---|
| 50 | 2012 | ALABAMA | 1927122 |
| 51 | 2012 | ALASKA | 288840 |
| 52 | 2012 | ARIZONA | 2173259 |
| 53 | 2012 | ARKANSAS | 1038054 |
| 54 | 2012 | CALIFORNIA | 12204357 |
| 55 | 2012 | COLORADO | 2450488 |
| 56 | 2012 | CONNECTICUT | 1465487 |
| 57 | 2012 | DELAWARE | 388059 |
| 58 | 2012 | FLORIDA | 7512911 |
| 59 | 2012 | GEORGIA | 3552967 |
| 60 | 2012 | HAWAII | 422539 |
| 61 | 2012 | IDAHO | 634983 |
| 62 | 2012 | ILLINOIS | 5057772 |
| 63 | 2012 | INDIANA | 2553743 |
| 64 | 2012 | IOWA | 1535469 |
| 65 | 2012 | KANSAS | 1057739 |
| 66 | 2012 | KENTUCKY | 1737037 |
| 67 | 2012 | LOUISIANA | 1705617 |
| 68 | 2012 | MAINE | 693801 |
| 69 | 2012 | MARYLAND | 2579538 |
| 70 | 2012 | MASSACHUSETTS | 2879565 |
| 71 | 2012 | MICHIGAN | 4574615 |
| 72 | 2012 | MINNESOTA | 2807826 |
| 73 | 2012 | MISSISSIPPI | 1208175 |
| 74 | 2012 | MISSOURI | 2675885 |
| 75 | 2012 | MONTANA | 479740 |
| 76 | 2012 | NEBRASKA | 772515 |
| 77 | 2012 | NEVADA | 973742 |
| 78 | 2012 | NEW HAMPSHIRE | 682018 |
| 79 | 2012 | NEW JERSEY | 3281778 |
| 80 | 2012 | NEW MEXICO | 765458 |
| 81 | 2012 | NEW YORK | 6456343 |
| 82 | 2012 | NORTH CAROLINA | 4379666 |
| 83 | 2012 | NORTH DAKOTA | 315716 |
| 84 | 2012 | OHIO | 5140157 |
| 85 | 2012 | OKLAHOMA | 1325935 |
| 86 | 2012 | OREGON | 1705571 |
| 87 | 2012 | PENNSYLVANIA | 5556330 |
| 88 | 2012 | RHODE ISLAND | 427321 |
| 89 | 2012 | SOUTH CAROLINA | 1791578 |
| 90 | 2012 | SOUTH DAKOTA | 361429 |
| 91 | 2012 | TENNESSEE | 2283173 |
| 92 | 2012 | TEXAS | 7663983 |
| 93 | 2012 | UTAH | 998897 |
| 94 | 2012 | VERMONT | 289663 |
| 95 | 2012 | VIRGINIA | 3733561 |
| 96 | 2012 | WASHINGTON | 3006266 |
| 97 | 2012 | WEST VIRGINIA | 641354 |
| 98 | 2012 | WISCONSIN | 2862341 |
| 99 | 2012 | WYOMING | 241205 |
# Order the 2012 totals alphabetically by state and renumber the rows 0..n-1.
# `reset_index(drop=True)` discards the old index directly; it replaces the
# former reset_index() followed by the positional-axis `drop('index', 1)`,
# which pandas >= 2.0 rejects.
df_2012 = df_2012.sort_values(by=['state']).reset_index(drop=True)
df_2012
| year | state | totalVotes | |
|---|---|---|---|
| 0 | 2012 | ALABAMA | 1927122 |
| 1 | 2012 | ALASKA | 288840 |
| 2 | 2012 | ARIZONA | 2173259 |
| 3 | 2012 | ARKANSAS | 1038054 |
| 4 | 2012 | CALIFORNIA | 12204357 |
| 5 | 2012 | COLORADO | 2450488 |
| 6 | 2012 | CONNECTICUT | 1465487 |
| 7 | 2012 | DELAWARE | 388059 |
| 8 | 2012 | FLORIDA | 7512911 |
| 9 | 2012 | GEORGIA | 3552967 |
| 10 | 2012 | HAWAII | 422539 |
| 11 | 2012 | IDAHO | 634983 |
| 12 | 2012 | ILLINOIS | 5057772 |
| 13 | 2012 | INDIANA | 2553743 |
| 14 | 2012 | IOWA | 1535469 |
| 15 | 2012 | KANSAS | 1057739 |
| 16 | 2012 | KENTUCKY | 1737037 |
| 17 | 2012 | LOUISIANA | 1705617 |
| 18 | 2012 | MAINE | 693801 |
| 19 | 2012 | MARYLAND | 2579538 |
| 20 | 2012 | MASSACHUSETTS | 2879565 |
| 21 | 2012 | MICHIGAN | 4574615 |
| 22 | 2012 | MINNESOTA | 2807826 |
| 23 | 2012 | MISSISSIPPI | 1208175 |
| 24 | 2012 | MISSOURI | 2675885 |
| 25 | 2012 | MONTANA | 479740 |
| 26 | 2012 | NEBRASKA | 772515 |
| 27 | 2012 | NEVADA | 973742 |
| 28 | 2012 | NEW HAMPSHIRE | 682018 |
| 29 | 2012 | NEW JERSEY | 3281778 |
| 30 | 2012 | NEW MEXICO | 765458 |
| 31 | 2012 | NEW YORK | 6456343 |
| 32 | 2012 | NORTH CAROLINA | 4379666 |
| 33 | 2012 | NORTH DAKOTA | 315716 |
| 34 | 2012 | OHIO | 5140157 |
| 35 | 2012 | OKLAHOMA | 1325935 |
| 36 | 2012 | OREGON | 1705571 |
| 37 | 2012 | PENNSYLVANIA | 5556330 |
| 38 | 2012 | RHODE ISLAND | 427321 |
| 39 | 2012 | SOUTH CAROLINA | 1791578 |
| 40 | 2012 | SOUTH DAKOTA | 361429 |
| 41 | 2012 | TENNESSEE | 2283173 |
| 42 | 2012 | TEXAS | 7663983 |
| 43 | 2012 | UTAH | 998897 |
| 44 | 2012 | VERMONT | 289663 |
| 45 | 2012 | VIRGINIA | 3733561 |
| 46 | 2012 | WASHINGTON | 3006266 |
| 47 | 2012 | WEST VIRGINIA | 641354 |
| 48 | 2012 | WISCONSIN | 2862341 |
| 49 | 2012 | WYOMING | 241205 |
# Voter turnout for 2012 = total votes / voting-age population.
# The VAP is looked up by state name rather than by row position, so the
# result no longer depends on both frames sharing the same index and order.
# State names are upper-cased to match the spelling used in the election data.
vap_2012_by_state = county_complete2.set_index(county_complete2['state'].str.upper())['VAP2012']

# `assign` returns a new frame instead of writing a column in place.
df_2012 = df_2012.assign(
    voter_turnout=df_2012['totalVotes'] / df_2012['state'].map(vap_2012_by_state)
)
df_2012
| year | state | totalVotes | voter_turnout | |
|---|---|---|---|---|
| 0 | 2012 | ALABAMA | 1927122 | 0.522994 |
| 1 | 2012 | ALASKA | 288840 | 0.539488 |
| 2 | 2012 | ARIZONA | 2173259 | 0.442987 |
| 3 | 2012 | ARKANSAS | 1038054 | 0.464497 |
| 4 | 2012 | CALIFORNIA | 12204357 | 0.425524 |
| 5 | 2012 | COLORADO | 2450488 | 0.621768 |
| 6 | 2012 | CONNECTICUT | 1465487 | 0.525340 |
| 7 | 2012 | DELAWARE | 388059 | 0.546646 |
| 8 | 2012 | FLORIDA | 7512911 | 0.492000 |
| 9 | 2012 | GEORGIA | 3552967 | 0.480467 |
| 10 | 2012 | HAWAII | 422539 | 0.389683 |
| 11 | 2012 | IDAHO | 634983 | 0.545747 |
| 12 | 2012 | ILLINOIS | 5057772 | 0.516987 |
| 13 | 2012 | INDIANA | 2553743 | 0.518037 |
| 14 | 2012 | IOWA | 1535469 | 0.655099 |
| 15 | 2012 | KANSAS | 1057739 | 0.490555 |
| 16 | 2012 | KENTUCKY | 1737037 | 0.517394 |
| 17 | 2012 | LOUISIANA | 1705617 | 0.490559 |
| 18 | 2012 | MAINE | 693801 | 0.655616 |
| 19 | 2012 | MARYLAND | 2579538 | 0.570220 |
| 20 | 2012 | MASSACHUSETTS | 2879565 | 0.549603 |
| 21 | 2012 | MICHIGAN | 4574615 | 0.603628 |
| 22 | 2012 | MINNESOTA | 2807826 | 0.687343 |
| 23 | 2012 | MISSISSIPPI | 1208175 | 0.541092 |
| 24 | 2012 | MISSOURI | 2675885 | 0.581221 |
| 25 | 2012 | MONTANA | 479740 | 0.616396 |
| 26 | 2012 | NEBRASKA | 772515 | 0.555982 |
| 27 | 2012 | NEVADA | 973742 | 0.467147 |
| 28 | 2012 | NEW HAMPSHIRE | 682018 | 0.656127 |
| 29 | 2012 | NEW JERSEY | 3281778 | 0.481169 |
| 30 | 2012 | NEW MEXICO | 765458 | 0.488719 |
| 31 | 2012 | NEW YORK | 6456343 | 0.422046 |
| 32 | 2012 | NORTH CAROLINA | 4379666 | 0.587885 |
| 33 | 2012 | NORTH DAKOTA | 315716 | 0.580659 |
| 34 | 2012 | OHIO | 5140157 | 0.581175 |
| 35 | 2012 | OKLAHOMA | 1325935 | 0.461493 |
| 36 | 2012 | OREGON | 1705571 | 0.563629 |
| 37 | 2012 | PENNSYLVANIA | 5556330 | 0.556153 |
| 38 | 2012 | RHODE ISLAND | 427321 | 0.513580 |
| 39 | 2012 | SOUTH CAROLINA | 1791578 | 0.493554 |
| 40 | 2012 | SOUTH DAKOTA | 361429 | 0.577783 |
| 41 | 2012 | TENNESSEE | 2283173 | 0.461853 |
| 42 | 2012 | TEXAS | 7663983 | 0.402798 |
| 43 | 2012 | UTAH | 998897 | 0.508480 |
| 44 | 2012 | VERMONT | 289663 | 0.580735 |
| 45 | 2012 | VIRGINIA | 3733561 | 0.591879 |
| 46 | 2012 | WASHINGTON | 3006266 | 0.568424 |
| 47 | 2012 | WEST VIRGINIA | 641354 | 0.436401 |
| 48 | 2012 | WISCONSIN | 2862341 | 0.651941 |
| 49 | 2012 | WYOMING | 241205 | 0.549824 |
We can calculate voter turnout by dividing the number of people who voted by the number of people who can vote. We already have the number of people who voted for each election year from 1976 to 2020. The second quantity, the number of people who can vote, is the number of people above the age of 18, which we can estimate with the formula: population size × percent of people above 18. However, we are missing population size values for the years 1976-1999 and 2001-2009, and the percent of people above 18 for all years except 2019.
New Features:
# Sum the candidate votes within each (year, state) pair across all years.
votes_by_year_state = house_elections.groupby(['year', 'state'], as_index=False)
df = votes_by_year_state['candidatevotes'].sum()
df
| year | state | candidatevotes | |
|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 |
| 1 | 1976 | ALASKA | 117916 |
| 2 | 1976 | ARIZONA | 729002 |
| 3 | 1976 | ARKANSAS | 336383 |
| 4 | 1976 | CALIFORNIA | 7442501 |
| ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 |
| 1147 | 2020 | WASHINGTON | 3885792 |
| 1148 | 2020 | WEST VIRGINIA | 761385 |
| 1149 | 2020 | WISCONSIN | 3235981 |
| 1150 | 2020 | WYOMING | 270367 |
1151 rows × 3 columns
# The summed candidate votes are the state's total votes; name the column so.
df = df.rename(columns={'candidatevotes': 'totalVotes'})
df
| year | state | totalVotes | |
|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 |
| 1 | 1976 | ALASKA | 117916 |
| 2 | 1976 | ARIZONA | 729002 |
| 3 | 1976 | ARKANSAS | 336383 |
| 4 | 1976 | CALIFORNIA | 7442501 |
| ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 |
| 1147 | 2020 | WASHINGTON | 3885792 |
| 1148 | 2020 | WEST VIRGINIA | 761385 |
| 1149 | 2020 | WISCONSIN | 3235981 |
| 1150 | 2020 | WYOMING | 270367 |
1151 rows × 3 columns
# Work on an independent copy so that county_complete itself is not modified.
features6 = county_complete.copy(deep=True)
features6
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | VAP2010 | VAP2012 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | 3.5 | 7.1 | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 | 39945.0 | 40536.0 |
| 1 | 01003 | Alabama | Baldwin County | 140415.0 | 182265 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | 202863.0 | ... | 4.0 | 8.9 | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 | 140344.0 | 146831.0 |
| 2 | 01005 | Alabama | Barbour County | 29038.0 | 27457 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | 26264.0 | ... | 9.4 | 11.3 | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 | 21443.0 | 21278.0 |
| 3 | 01007 | Alabama | Bibb County | 20826.0 | 22915 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | 22561.0 | ... | 7.0 | 10.7 | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 | 17713.0 | 17609.0 |
| 4 | 01009 | Alabama | Blount County | 51024.0 | 57322 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | 57590.0 | ... | 3.1 | 10.8 | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 | 43220.0 | 43587.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3137 | 56037 | Wyoming | Sweetwater County | 37613.0 | 43806 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | 44732.0 | ... | 5.7 | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 | 31934.0 | 32889.0 |
| 3138 | 56039 | Wyoming | Teton County | 18251.0 | 21294 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | 23029.0 | ... | 0.7 | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 | 17226.0 | 17562.0 |
| 3139 | 56041 | Wyoming | Uinta County | 19742.0 | 21118 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | 20780.0 | ... | 5.5 | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 | 14740.0 | 14699.0 |
| 3140 | 56043 | Wyoming | Washakie County | 8289.0 | 8533 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | 8296.0 | ... | 4.1 | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 | 6357.0 | 6288.0 |
| 3141 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | 4.0 | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 | 5636.0 | 5543.0 |
3142 rows × 190 columns
# Aggregate the county-level population columns to one row per state.
# The columns are selected with a list: selecting several GroupBy columns
# with a bare tuple (['a', 'b'] without the extra brackets) was deprecated in
# pandas 1.0 and raises an error in pandas 2.0.
# District of Columbia is still present in the result at this point.
population_columns = ['pop2000', 'pop2010', 'pop2012', 'pop2014', 'pop2016']
features6 = features6.groupby('state', as_index=False)[population_columns].sum()
features6
| state | pop2000 | pop2010 | pop2012 | pop2014 | pop2016 | |
|---|---|---|---|---|---|---|
| 0 | Alabama | 4447100.0 | 4779736 | 4813946.0 | 4840037.0 | 4860545.0 |
| 1 | Alaska | 626932.0 | 710231 | 725255.0 | 731131.0 | 735745.0 |
| 2 | Arizona | 5130632.0 | 6392017 | 6544211.0 | 6706435.0 | 6908642.0 |
| 3 | Arkansas | 2673400.0 | 2915918 | 2949208.0 | 2964800.0 | 2988231.0 |
| 4 | California | 33871648.0 | 37253956 | 38019006.0 | 38701278.0 | 39296476.0 |
| 5 | Colorado | 4301261.0 | 5029196 | 5186330.0 | 5342311.0 | 5530105.0 |
| 6 | Connecticut | 3405565.0 | 3574097 | 3597705.0 | 3600188.0 | 3587685.0 |
| 7 | Delaware | 783600.0 | 897934 | 916868.0 | 934805.0 | 952698.0 |
| 8 | District of Columbia | 572059.0 | 601723 | 635630.0 | 660797.0 | 684336.0 |
| 9 | Florida | 15982378.0 | 18801310 | 19341327.0 | 19897747.0 | 20656589.0 |
| 10 | Georgia | 8186453.0 | 9687653 | 9911171.0 | 10083850.0 | 10313620.0 |
| 11 | Hawaii | 1211537.0 | 1360301 | 1392772.0 | 1417710.0 | 1428683.0 |
| 12 | Idaho | 1293953.0 | 1567582 | 1594673.0 | 1630391.0 | 1680026.0 |
| 13 | Illinois | 12419293.0 | 12830632 | 12878494.0 | 12882438.0 | 12835726.0 |
| 14 | Indiana | 6080485.0 | 6483802 | 6535665.0 | 6593182.0 | 6634007.0 |
| 15 | Iowa | 2926324.0 | 3046355 | 3074386.0 | 3105563.0 | 3130869.0 |
| 16 | Kansas | 2688418.0 | 2853118 | 2885316.0 | 2899553.0 | 2907731.0 |
| 17 | Kentucky | 4041769.0 | 4339367 | 4383673.0 | 4410415.0 | 4436113.0 |
| 18 | Louisiana | 4468976.0 | 4533372 | 4602681.0 | 4648797.0 | 4686157.0 |
| 19 | Maine | 1274923.0 | 1328361 | 1328101.0 | 1328903.0 | 1330232.0 |
| 20 | Maryland | 5296486.0 | 5773552 | 5891680.0 | 5970245.0 | 6024752.0 |
| 21 | Massachusetts | 6349097.0 | 6547629 | 6659627.0 | 6757925.0 | 6823721.0 |
| 22 | Michigan | 9938444.0 | 9883640 | 9886610.0 | 9914675.0 | 9933445.0 |
| 23 | Minnesota | 4919479.0 | 5303925 | 5377695.0 | 5452649.0 | 5525050.0 |
| 24 | Mississippi | 2844658.0 | 2967297 | 2982963.0 | 2988578.0 | 2985415.0 |
| 25 | Missouri | 5595211.0 | 5988927 | 6023267.0 | 6058014.0 | 6091176.0 |
| 26 | Montana | 902195.0 | 989415 | 1003522.0 | 1019931.0 | 1038656.0 |
| 27 | Nebraska | 1711263.0 | 1826341 | 1854862.0 | 1880920.0 | 1907603.0 |
| 28 | Nevada | 1998257.0 | 2700551 | 2752410.0 | 2831730.0 | 2939254.0 |
| 29 | New Hampshire | 1235786.0 | 1316470 | 1320923.0 | 1328684.0 | 1335015.0 |
| 30 | New Jersey | 8414350.0 | 8791894 | 8882095.0 | 8943010.0 | 8978416.0 |
| 31 | New Mexico | 1819046.0 | 2059179 | 2083590.0 | 2083207.0 | 2085432.0 |
| 32 | New York | 18976457.0 | 19378102 | 19625409.0 | 19773580.0 | 19836286.0 |
| 33 | North Carolina | 8049313.0 | 9535483 | 9755299.0 | 9941160.0 | 10156689.0 |
| 34 | North Dakota | 642200.0 | 672591 | 701380.0 | 738658.0 | 755548.0 |
| 35 | Ohio | 11353140.0 | 11536504 | 11546969.0 | 11593741.0 | 11622554.0 |
| 36 | Oklahoma | 3450654.0 | 3751351 | 3815298.0 | 3875008.0 | 3921207.0 |
| 37 | Oregon | 3421399.0 | 3831074 | 3893920.0 | 3960673.0 | 4085989.0 |
| 38 | Pennsylvania | 12281054.0 | 12702379 | 12768034.0 | 12790341.0 | 12787085.0 |
| 39 | Rhode Island | 1048319.0 | 1052567 | 1052761.0 | 1054782.0 | 1057566.0 |
| 40 | South Carolina | 4012012.0 | 4625364 | 4719009.0 | 4824758.0 | 4959822.0 |
| 41 | South Dakota | 754844.0 | 814180 | 832576.0 | 849455.0 | 861542.0 |
| 42 | Tennessee | 5689283.0 | 6346105 | 6450632.0 | 6540007.0 | 6649404.0 |
| 43 | Texas | 20851820.0 | 25145561 | 26078327.0 | 26954436.0 | 27904862.0 |
| 44 | Utah | 2233169.0 | 2763885 | 2854222.0 | 2938671.0 | 3044321.0 |
| 45 | Vermont | 608827.0 | 625741 | 625606.0 | 625665.0 | 623354.0 |
| 46 | Virginia | 7067927.0 | 7994802 | 8188656.0 | 8316902.0 | 8414380.0 |
| 47 | Washington | 5894121.0 | 6724540 | 6890899.0 | 7046931.0 | 7280934.0 |
| 48 | West Virginia | 1808344.0 | 1852994 | 1855360.0 | 1847624.0 | 1828637.0 |
| 49 | Wisconsin | 5363675.0 | 5686986 | 5721075.0 | 5751272.0 | 5772917.0 |
| 50 | Wyoming | 493782.0 | 563626 | 576608.0 | 583334.0 | 584910.0 |
# Keep every state row except District of Columbia, then renumber the rows
# 0..n-1. reset_index() without drop=True keeps the previous row labels in a
# new 'index' column.
is_not_dc = features6['state'].ne('District of Columbia')
features6 = features6.loc[is_not_dc].reset_index()
features6
| index | state | pop2000 | pop2010 | pop2012 | pop2014 | pop2016 | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | Alabama | 4447100.0 | 4779736 | 4813946.0 | 4840037.0 | 4860545.0 |
| 1 | 1 | Alaska | 626932.0 | 710231 | 725255.0 | 731131.0 | 735745.0 |
| 2 | 2 | Arizona | 5130632.0 | 6392017 | 6544211.0 | 6706435.0 | 6908642.0 |
| 3 | 3 | Arkansas | 2673400.0 | 2915918 | 2949208.0 | 2964800.0 | 2988231.0 |
| 4 | 4 | California | 33871648.0 | 37253956 | 38019006.0 | 38701278.0 | 39296476.0 |
| 5 | 5 | Colorado | 4301261.0 | 5029196 | 5186330.0 | 5342311.0 | 5530105.0 |
| 6 | 6 | Connecticut | 3405565.0 | 3574097 | 3597705.0 | 3600188.0 | 3587685.0 |
| 7 | 7 | Delaware | 783600.0 | 897934 | 916868.0 | 934805.0 | 952698.0 |
| 8 | 9 | Florida | 15982378.0 | 18801310 | 19341327.0 | 19897747.0 | 20656589.0 |
| 9 | 10 | Georgia | 8186453.0 | 9687653 | 9911171.0 | 10083850.0 | 10313620.0 |
| 10 | 11 | Hawaii | 1211537.0 | 1360301 | 1392772.0 | 1417710.0 | 1428683.0 |
| 11 | 12 | Idaho | 1293953.0 | 1567582 | 1594673.0 | 1630391.0 | 1680026.0 |
| 12 | 13 | Illinois | 12419293.0 | 12830632 | 12878494.0 | 12882438.0 | 12835726.0 |
| 13 | 14 | Indiana | 6080485.0 | 6483802 | 6535665.0 | 6593182.0 | 6634007.0 |
| 14 | 15 | Iowa | 2926324.0 | 3046355 | 3074386.0 | 3105563.0 | 3130869.0 |
| 15 | 16 | Kansas | 2688418.0 | 2853118 | 2885316.0 | 2899553.0 | 2907731.0 |
| 16 | 17 | Kentucky | 4041769.0 | 4339367 | 4383673.0 | 4410415.0 | 4436113.0 |
| 17 | 18 | Louisiana | 4468976.0 | 4533372 | 4602681.0 | 4648797.0 | 4686157.0 |
| 18 | 19 | Maine | 1274923.0 | 1328361 | 1328101.0 | 1328903.0 | 1330232.0 |
| 19 | 20 | Maryland | 5296486.0 | 5773552 | 5891680.0 | 5970245.0 | 6024752.0 |
| 20 | 21 | Massachusetts | 6349097.0 | 6547629 | 6659627.0 | 6757925.0 | 6823721.0 |
| 21 | 22 | Michigan | 9938444.0 | 9883640 | 9886610.0 | 9914675.0 | 9933445.0 |
| 22 | 23 | Minnesota | 4919479.0 | 5303925 | 5377695.0 | 5452649.0 | 5525050.0 |
| 23 | 24 | Mississippi | 2844658.0 | 2967297 | 2982963.0 | 2988578.0 | 2985415.0 |
| 24 | 25 | Missouri | 5595211.0 | 5988927 | 6023267.0 | 6058014.0 | 6091176.0 |
| 25 | 26 | Montana | 902195.0 | 989415 | 1003522.0 | 1019931.0 | 1038656.0 |
| 26 | 27 | Nebraska | 1711263.0 | 1826341 | 1854862.0 | 1880920.0 | 1907603.0 |
| 27 | 28 | Nevada | 1998257.0 | 2700551 | 2752410.0 | 2831730.0 | 2939254.0 |
| 28 | 29 | New Hampshire | 1235786.0 | 1316470 | 1320923.0 | 1328684.0 | 1335015.0 |
| 29 | 30 | New Jersey | 8414350.0 | 8791894 | 8882095.0 | 8943010.0 | 8978416.0 |
| 30 | 31 | New Mexico | 1819046.0 | 2059179 | 2083590.0 | 2083207.0 | 2085432.0 |
| 31 | 32 | New York | 18976457.0 | 19378102 | 19625409.0 | 19773580.0 | 19836286.0 |
| 32 | 33 | North Carolina | 8049313.0 | 9535483 | 9755299.0 | 9941160.0 | 10156689.0 |
| 33 | 34 | North Dakota | 642200.0 | 672591 | 701380.0 | 738658.0 | 755548.0 |
| 34 | 35 | Ohio | 11353140.0 | 11536504 | 11546969.0 | 11593741.0 | 11622554.0 |
| 35 | 36 | Oklahoma | 3450654.0 | 3751351 | 3815298.0 | 3875008.0 | 3921207.0 |
| 36 | 37 | Oregon | 3421399.0 | 3831074 | 3893920.0 | 3960673.0 | 4085989.0 |
| 37 | 38 | Pennsylvania | 12281054.0 | 12702379 | 12768034.0 | 12790341.0 | 12787085.0 |
| 38 | 39 | Rhode Island | 1048319.0 | 1052567 | 1052761.0 | 1054782.0 | 1057566.0 |
| 39 | 40 | South Carolina | 4012012.0 | 4625364 | 4719009.0 | 4824758.0 | 4959822.0 |
| 40 | 41 | South Dakota | 754844.0 | 814180 | 832576.0 | 849455.0 | 861542.0 |
| 41 | 42 | Tennessee | 5689283.0 | 6346105 | 6450632.0 | 6540007.0 | 6649404.0 |
| 42 | 43 | Texas | 20851820.0 | 25145561 | 26078327.0 | 26954436.0 | 27904862.0 |
| 43 | 44 | Utah | 2233169.0 | 2763885 | 2854222.0 | 2938671.0 | 3044321.0 |
| 44 | 45 | Vermont | 608827.0 | 625741 | 625606.0 | 625665.0 | 623354.0 |
| 45 | 46 | Virginia | 7067927.0 | 7994802 | 8188656.0 | 8316902.0 | 8414380.0 |
| 46 | 47 | Washington | 5894121.0 | 6724540 | 6890899.0 | 7046931.0 | 7280934.0 |
| 47 | 48 | West Virginia | 1808344.0 | 1852994 | 1855360.0 | 1847624.0 | 1828637.0 |
| 48 | 49 | Wisconsin | 5363675.0 | 5686986 | 5721075.0 | 5751272.0 | 5772917.0 |
| 49 | 50 | Wyoming | 493782.0 | 563626 | 576608.0 | 583334.0 | 584910.0 |
# Add a 'population' column initialised to 0; it is filled in year by year
# by the following cells.
df = df.assign(population=0)
df
| year | state | totalVotes | population | |
|---|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 | 0 |
| 1 | 1976 | ALASKA | 117916 | 0 |
| 2 | 1976 | ARIZONA | 729002 | 0 |
| 3 | 1976 | ARKANSAS | 336383 | 0 |
| 4 | 1976 | CALIFORNIA | 7442501 | 0 |
| ... | ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 | 0 |
| 1147 | 2020 | WASHINGTON | 3885792 | 0 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 0 |
| 1149 | 2020 | WISCONSIN | 3235981 | 0 |
| 1150 | 2020 | WYOMING | 270367 | 0 |
1151 rows × 4 columns
# Copy the state populations from features6 into df for the election years
# that have a matching population column. `.values` strips the index, so the
# assignment is purely positional: it assumes the rows of df for that year
# and the rows of features6 are in the same state order and of equal length.
for election_year, population_column in (
    (2000, 'pop2000'),
    (2010, 'pop2010'),
    (2012, 'pop2012'),
    (2014, 'pop2014'),
    (2016, 'pop2016'),
):
    df.loc[df['year'] == election_year, 'population'] = features6[population_column].values
df[df['year'] == 2000]
| year | state | totalVotes | population | |
|---|---|---|---|---|
| 600 | 2000 | ALABAMA | 1431797 | 4447100 |
| 601 | 2000 | ALASKA | 273561 | 626932 |
| 602 | 2000 | ARIZONA | 1465656 | 5130632 |
| 603 | 2000 | ARKANSAS | 632513 | 2673400 |
| 604 | 2000 | CALIFORNIA | 10436607 | 33871648 |
| 605 | 2000 | COLORADO | 1619335 | 4301261 |
| 606 | 2000 | CONNECTICUT | 1313483 | 3405565 |
| 607 | 2000 | DELAWARE | 313126 | 783600 |
| 608 | 2000 | FLORIDA | 5009584 | 15982378 |
| 609 | 2000 | GEORGIA | 2416422 | 8186453 |
| 610 | 2000 | HAWAII | 340424 | 1211537 |
| 611 | 2000 | IDAHO | 492835 | 1293953 |
| 612 | 2000 | ILLINOIS | 4393184 | 12419293 |
| 613 | 2000 | INDIANA | 2156456 | 6080485 |
| 614 | 2000 | IOWA | 1275449 | 2926324 |
| 615 | 2000 | KANSAS | 1035961 | 2688418 |
| 616 | 2000 | KENTUCKY | 1435409 | 4041769 |
| 617 | 2000 | LOUISIANA | 1202172 | 4468976 |
| 618 | 2000 | MAINE | 638399 | 1274923 |
| 619 | 2000 | MARYLAND | 1924669 | 5296486 |
| 620 | 2000 | MASSACHUSETTS | 2334148 | 6349097 |
| 621 | 2000 | MICHIGAN | 4069625 | 9938444 |
| 622 | 2000 | MINNESOTA | 2363738 | 4919479 |
| 623 | 2000 | MISSISSIPPI | 986139 | 2844658 |
| 624 | 2000 | MISSOURI | 2325788 | 5595211 |
| 625 | 2000 | MONTANA | 410521 | 902195 |
| 626 | 2000 | NEBRASKA | 682496 | 1711263 |
| 627 | 2000 | NEVADA | 585204 | 1998257 |
| 628 | 2000 | NEW HAMPSHIRE | 556049 | 1235786 |
| 629 | 2000 | NEW JERSEY | 2988233 | 8414350 |
| 630 | 2000 | NEW MEXICO | 587514 | 1819046 |
| 631 | 2000 | NEW YORK | 5813877 | 18976457 |
| 632 | 2000 | NORTH CAROLINA | 2779168 | 8049313 |
| 633 | 2000 | NORTH DAKOTA | 285636 | 642200 |
| 634 | 2000 | OHIO | 4585038 | 11353140 |
| 635 | 2000 | OKLAHOMA | 1087515 | 3450654 |
| 636 | 2000 | OREGON | 1437425 | 3421399 |
| 637 | 2000 | PENNSYLVANIA | 4552010 | 12281054 |
| 638 | 2000 | RHODE ISLAND | 383862 | 1048319 |
| 639 | 2000 | SOUTH CAROLINA | 1320594 | 4012012 |
| 640 | 2000 | SOUTH DAKOTA | 314761 | 754844 |
| 641 | 2000 | TENNESSEE | 1852935 | 5689283 |
| 642 | 2000 | TEXAS | 5982441 | 20851820 |
| 643 | 2000 | UTAH | 758754 | 2233169 |
| 644 | 2000 | VERMONT | 282606 | 608827 |
| 645 | 2000 | VIRGINIA | 2412724 | 7067927 |
| 646 | 2000 | WASHINGTON | 2382411 | 5894121 |
| 647 | 2000 | WEST VIRGINIA | 579872 | 1808344 |
| 648 | 2000 | WISCONSIN | 2503018 | 5363675 |
| 649 | 2000 | WYOMING | 212312 | 493782 |
# Mean per-state ratio pop2012 / pop2010, expressed as a percentage change.
(features6['pop2012'].div(features6['pop2010']).mean() - 1) * 100
1.5146648329240398
On average, the population increases by about 1.5% every 2 years, so we assume that it also decreases by about 1.5% for every 2-year step going backward.
# Estimate the population for the election years 1998 down to 1976.
# Starting from the 2000 figures, the running estimate is multiplied by
# (1 - 0.015) once per 2-year step, and its floor is stored for that year.
# The assignment is positional (NumPy array), one value per state row.
population_estimate = features6['pop2000'].values
for election_year in range(1998, 1974, -2):
    population_estimate = population_estimate * (1 - 0.015)
    df.loc[df['year'] == election_year, 'population'] = np.floor(population_estimate)
df
| year | state | totalVotes | population | |
|---|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 | 3709468 |
| 1 | 1976 | ALASKA | 117916 | 522944 |
| 2 | 1976 | ARIZONA | 729002 | 4279624 |
| 3 | 1976 | ARKANSAS | 336383 | 2229968 |
| 4 | 1976 | CALIFORNIA | 7442501 | 28253424 |
| ... | ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 | 0 |
| 1147 | 2020 | WASHINGTON | 3885792 | 0 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 0 |
| 1149 | 2020 | WISCONSIN | 3235981 | 0 |
| 1150 | 2020 | WYOMING | 270367 | 0 |
1151 rows × 4 columns
# Estimate the population for the election years 2002, 2004, 2006 and 2008.
# Starting from the 2000 figures, the running estimate grows by a factor of
# (1 + 0.015) per 2-year step, and its floor is stored for that year.
population_estimate = features6['pop2000'].values
for election_year in range(2002, 2010, 2):
    population_estimate = population_estimate * (1 + 0.015)
    df.loc[df['year'] == election_year, 'population'] = np.floor(population_estimate)
# Show only the rows that were just filled (2002 <= year < 2010).
df[df['year'].between(2002, 2009)]
| year | state | totalVotes | population | |
|---|---|---|---|---|
| 650 | 2002 | ALABAMA | 1266562 | 4513806 |
| 651 | 2002 | ALASKA | 227274 | 636335 |
| 652 | 2002 | ARIZONA | 1194365 | 5207591 |
| 653 | 2002 | ARKANSAS | 675825 | 2713500 |
| 654 | 2002 | CALIFORNIA | 7256415 | 34379722 |
| ... | ... | ... | ... | ... |
| 845 | 2008 | VIRGINIA | 3476710 | 7501640 |
| 846 | 2008 | WASHINGTON | 2914463 | 6255805 |
| 847 | 2008 | WEST VIRGINIA | 645414 | 1919310 |
| 848 | 2008 | WISCONSIN | 2770362 | 5692809 |
| 849 | 2008 | WYOMING | 249032 | 524082 |
200 rows × 4 columns
# Remove the District of Columbia rows so that every year has the same set of
# state rows as features6. The explicit .copy() makes the result an
# independent DataFrame: later cells add columns and assign through .loc, and
# writing into a boolean-filtered slice without a copy can trigger pandas'
# SettingWithCopy behaviour (silenced here by warnings.filterwarnings).
df = df[df['state'] != 'DISTRICT OF COLUMBIA'].copy()
df[df['year'] == 2020]
| year | state | totalVotes | population | |
|---|---|---|---|---|
| 1100 | 2020 | ALABAMA | 2024821 | 0 |
| 1101 | 2020 | ALASKA | 351982 | 0 |
| 1102 | 2020 | ARIZONA | 3268249 | 0 |
| 1103 | 2020 | ARKANSAS | 1179396 | 0 |
| 1104 | 2020 | CALIFORNIA | 16724901 | 0 |
| 1105 | 2020 | COLORADO | 3164950 | 0 |
| 1106 | 2020 | CONNECTICUT | 1772927 | 0 |
| 1107 | 2020 | DELAWARE | 488270 | 0 |
| 1109 | 2020 | FLORIDA | 10464790 | 0 |
| 1110 | 2020 | GEORGIA | 4883611 | 0 |
| 1111 | 2020 | HAWAII | 526535 | 0 |
| 1112 | 2020 | IDAHO | 849909 | 0 |
| 1113 | 2020 | ILLINOIS | 5876819 | 0 |
| 1114 | 2020 | INDIANA | 2996444 | 0 |
| 1115 | 2020 | IOWA | 1637050 | 0 |
| 1116 | 2020 | KANSAS | 1358953 | 0 |
| 1117 | 2020 | KENTUCKY | 2115895 | 0 |
| 1118 | 2020 | LOUISIANA | 2147135 | 0 |
| 1119 | 2020 | MAINE | 809214 | 0 |
| 1120 | 2020 | MARYLAND | 2947893 | 0 |
| 1121 | 2020 | MASSACHUSETTS | 3304030 | 0 |
| 1122 | 2020 | MICHIGAN | 5423140 | 0 |
| 1123 | 2020 | MINNESOTA | 3189307 | 0 |
| 1124 | 2020 | MISSISSIPPI | 1227846 | 0 |
| 1125 | 2020 | MISSOURI | 2973421 | 0 |
| 1126 | 2020 | MONTANA | 601509 | 0 |
| 1127 | 2020 | NEBRASKA | 941298 | 0 |
| 1128 | 2020 | NEVADA | 1355607 | 0 |
| 1129 | 2020 | NEW HAMPSHIRE | 786806 | 0 |
| 1130 | 2020 | NEW JERSEY | 4432923 | 0 |
| 1131 | 2020 | NEW MEXICO | 903684 | 0 |
| 1132 | 2020 | NEW YORK | 8216413 | 0 |
| 1133 | 2020 | NORTH CAROLINA | 5324819 | 0 |
| 1134 | 2020 | NORTH DAKOTA | 355223 | 0 |
| 1135 | 2020 | OHIO | 5761540 | 0 |
| 1136 | 2020 | OKLAHOMA | 1551383 | 0 |
| 1137 | 2020 | OREGON | 2304718 | 0 |
| 1138 | 2020 | PENNSYLVANIA | 6779307 | 0 |
| 1139 | 2020 | RHODE ISLAND | 486287 | 0 |
| 1140 | 2020 | SOUTH CAROLINA | 2503382 | 0 |
| 1141 | 2020 | SOUTH DAKOTA | 397732 | 0 |
| 1142 | 2020 | TENNESSEE | 2841744 | 0 |
| 1143 | 2020 | TEXAS | 11093626 | 0 |
| 1144 | 2020 | UTAH | 1432232 | 0 |
| 1145 | 2020 | VERMONT | 354295 | 0 |
| 1146 | 2020 | VIRGINIA | 4310779 | 0 |
| 1147 | 2020 | WASHINGTON | 3885792 | 0 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 0 |
| 1149 | 2020 | WISCONSIN | 3235981 | 0 |
| 1150 | 2020 | WYOMING | 270367 | 0 |
# Estimate the population for the election years 2018 and 2020.
# Starting from the 2016 figures, the running estimate grows by a factor of
# (1 + 0.015) per 2-year step, and its floor is stored for that year.
population_estimate = features6['pop2016'].values
for election_year in range(2018, 2022, 2):
    population_estimate = population_estimate * (1 + 0.015)
    df.loc[df['year'] == election_year, 'population'] = np.floor(population_estimate)
df
| year | state | totalVotes | population | |
|---|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 | 3709468 |
| 1 | 1976 | ALASKA | 117916 | 522944 |
| 2 | 1976 | ARIZONA | 729002 | 4279624 |
| 3 | 1976 | ARKANSAS | 336383 | 2229968 |
| 4 | 1976 | CALIFORNIA | 7442501 | 28253424 |
| ... | ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 | 8668704 |
| 1147 | 2020 | WASHINGTON | 3885792 | 7501000 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 1883907 |
| 1149 | 2020 | WISCONSIN | 3235981 | 5947403 |
| 1150 | 2020 | WYOMING | 270367 | 602588 |
1150 rows × 4 columns
# Add a 'VAP' column initialised to 0 and fill the two years for which
# county_complete2 provides a VAP column. The assignment is positional
# (`.values`), so it assumes both frames list the states in the same order.
df['VAP'] = 0
for election_year, vap_column in ((2010, 'VAP2010'), (2012, 'VAP2012')):
    df.loc[df['year'] == election_year, 'VAP'] = county_complete2[vap_column].values
# Mean per-state ratio VAP(2012) / VAP(2010).
(
    df.loc[df['year'] == 2012, 'VAP'].values
    / df.loc[df['year'] == 2010, 'VAP'].values
).mean()
1.0185722751685544
Looking at the result above, we can assume that the VAP increases by about 1.8% every 2 years, which means that if we go back in time it also decreases by about 1.8% for every 2-year step.
# Estimate the VAP for the election years 2008 down to 1976.
# Starting from the 2010 figures, the running estimate is multiplied by
# (1 - 0.018) once per 2-year step, and its floor is stored for that year.
vap_estimate = county_complete2['VAP2010'].values
for election_year in range(2008, 1974, -2):
    vap_estimate = vap_estimate * (1 - 0.018)
    df.loc[df['year'] == election_year, 'VAP'] = np.floor(vap_estimate)
df
| year | state | totalVotes | population | VAP | |
|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 | 3709468 | 2678112 |
| 1 | 1976 | ALASKA | 117916 | 522944 | 383875 |
| 2 | 1976 | ARIZONA | 729002 | 4279624 | 3497741 |
| 3 | 1976 | ARKANSAS | 336383 | 2229968 | 1618708 |
| 4 | 1976 | CALIFORNIA | 7442501 | 28253424 | 20528527 |
| ... | ... | ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 | 8668704 | 0 |
| 1147 | 2020 | WASHINGTON | 3885792 | 7501000 | 0 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 1883907 | 0 |
| 1149 | 2020 | WISCONSIN | 3235981 | 5947403 | 0 |
| 1150 | 2020 | WYOMING | 270367 | 602588 | 0 |
1150 rows × 5 columns
# Estimate the VAP for the election years 2014 through 2020.
# Starting from the 2012 figures, the running estimate grows by a factor of
# (1 + 0.018) per 2-year step, and its floor is stored for that year.
vap_estimate = county_complete2['VAP2012'].values
for election_year in range(2014, 2022, 2):
    vap_estimate = vap_estimate * (1 + 0.018)
    df.loc[df['year'] == election_year, 'VAP'] = np.floor(vap_estimate)
df
| year | state | totalVotes | population | VAP | |
|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 | 3709468 | 2678112 |
| 1 | 1976 | ALASKA | 117916 | 522944 | 383875 |
| 2 | 1976 | ARIZONA | 729002 | 4279624 | 3497741 |
| 3 | 1976 | ARKANSAS | 336383 | 2229968 | 1618708 |
| 4 | 1976 | CALIFORNIA | 7442501 | 28253424 | 20528527 |
| ... | ... | ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 | 8668704 | 6774559 |
| 1147 | 2020 | WASHINGTON | 3885792 | 7501000 | 5679975 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 1883907 | 1578350 |
| 1149 | 2020 | WISCONSIN | 3235981 | 5947403 | 4715247 |
| 1150 | 2020 | WYOMING | 270367 | 602588 | 471144 |
1150 rows × 5 columns
# can_vote_percent: VAP as a fraction of the population.
df['can_vote_percent'] = df['VAP'].div(df['population'])
# voter_turnout: total votes cast as a fraction of the VAP.
df['voter_turnout'] = df['totalVotes'].div(df['VAP'])
df
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | |
|---|---|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 | 3709468 | 2678112 | 0.721967 | 0.367481 |
| 1 | 1976 | ALASKA | 117916 | 522944 | 383875 | 0.734065 | 0.307173 |
| 2 | 1976 | ARIZONA | 729002 | 4279624 | 3497741 | 0.817301 | 0.208421 |
| 3 | 1976 | ARKANSAS | 336383 | 2229968 | 1618708 | 0.725888 | 0.207810 |
| 4 | 1976 | CALIFORNIA | 7442501 | 28253424 | 20528527 | 0.726585 | 0.362544 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 | 8668704 | 6774559 | 0.781496 | 0.636319 |
| 1147 | 2020 | WASHINGTON | 3885792 | 7501000 | 5679975 | 0.757229 | 0.684121 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 1883907 | 1578350 | 0.837807 | 0.482393 |
| 1149 | 2020 | WISCONSIN | 3235981 | 5947403 | 4715247 | 0.792825 | 0.686280 |
| 1150 | 2020 | WYOMING | 270367 | 602588 | 471144 | 0.781868 | 0.573852 |
1150 rows × 7 columns
We assume that the percentage of women stays constant over the years.
# Number of women per county: pop2010 times female_2010, where female_2010 is
# treated as a percentage (divided by 100), rounded down.
female_fraction_2010 = county_complete['female_2010'] / 100
county_complete['female_num'] = np.floor(county_complete['pop2010'] * female_fraction_2010)
county_complete
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | VAP2010 | VAP2012 | female_num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | 7.1 | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 | 39945.0 | 40536.0 | 27994.0 |
| 1 | 01003 | Alabama | Baldwin County | 140415.0 | 182265 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | 202863.0 | ... | 8.9 | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 | 140344.0 | 146831.0 | 93137.0 |
| 2 | 01005 | Alabama | Barbour County | 29038.0 | 27457 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | 26264.0 | ... | 11.3 | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 | 21443.0 | 21278.0 | 12877.0 |
| 3 | 01007 | Alabama | Bibb County | 20826.0 | 22915 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | 22561.0 | ... | 10.7 | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 | 17713.0 | 17609.0 | 10609.0 |
| 4 | 01009 | Alabama | Blount County | 51024.0 | 57322 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | 57590.0 | ... | 10.8 | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 | 43220.0 | 43587.0 | 28947.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3137 | 56037 | Wyoming | Sweetwater County | 37613.0 | 43806 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | 44732.0 | ... | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 | 31934.0 | 32889.0 | 20939.0 |
| 3138 | 56039 | Wyoming | Teton County | 18251.0 | 21294 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | 23029.0 | ... | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 | 17226.0 | 17562.0 | 10093.0 |
| 3139 | 56041 | Wyoming | Uinta County | 19742.0 | 21118 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | 20780.0 | ... | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 | 14740.0 | 14699.0 | 10453.0 |
| 3140 | 56043 | Wyoming | Washakie County | 8289.0 | 8533 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | 8296.0 | ... | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 | 6357.0 | 6288.0 | 4275.0 |
| 3141 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 | 5636.0 | 5543.0 | 3416.0 |
3142 rows × 191 columns
# Sum female_num per state, drop the District of Columbia row, and renumber
# the rows (reset_index() keeps the previous labels in an 'index' column).
county_complete3 = (
    county_complete
    .groupby('state', as_index=False)['female_num']
    .sum()
    .loc[lambda per_state: per_state['state'] != 'District of Columbia']
    .reset_index()
)
county_complete3
| index | state | female_num | |
|---|---|---|---|
| 0 | 0 | Alabama | 2459237.0 |
| 1 | 1 | Alaska | 340578.0 |
| 2 | 2 | Arizona | 3215163.0 |
| 3 | 3 | Arkansas | 1484149.0 |
| 4 | 4 | California | 18730716.0 |
| 5 | 5 | Colorado | 2509043.0 |
| 6 | 6 | Connecticut | 1835148.0 |
| 7 | 7 | Delaware | 463031.0 |
| 8 | 9 | Florida | 9613839.0 |
| 9 | 10 | Georgia | 4957744.0 |
| 10 | 11 | Hawaii | 678539.0 |
| 11 | 12 | Idaho | 782011.0 |
| 12 | 13 | Illinois | 6538181.0 |
| 13 | 14 | Indiana | 3294306.0 |
| 14 | 15 | Iowa | 1538196.0 |
| 15 | 16 | Kansas | 1437803.0 |
| 16 | 17 | Kentucky | 2204181.0 |
| 17 | 18 | Louisiana | 2313889.0 |
| 18 | 19 | Maine | 678503.0 |
| 19 | 20 | Maryland | 2980796.0 |
| 20 | 21 | Massachusetts | 3381707.0 |
| 21 | 22 | Michigan | 5036146.0 |
| 22 | 23 | Minnesota | 2671946.0 |
| 23 | 24 | Mississippi | 1526108.0 |
| 24 | 25 | Missouri | 3055921.0 |
| 25 | 26 | Montana | 492719.0 |
| 26 | 27 | Nebraska | 919746.0 |
| 27 | 28 | Nevada | 1337536.0 |
| 28 | 29 | New Hampshire | 666927.0 |
| 29 | 30 | New Jersey | 4512482.0 |
| 30 | 31 | New Mexico | 1041638.0 |
| 31 | 32 | New York | 10001353.0 |
| 32 | 33 | North Carolina | 4889810.0 |
| 33 | 34 | North Dakota | 332680.0 |
| 34 | 35 | Ohio | 5904814.0 |
| 35 | 36 | Oklahoma | 1894345.0 |
| 36 | 37 | Oregon | 1934471.0 |
| 37 | 38 | Pennsylvania | 6511648.0 |
| 38 | 39 | Rhode Island | 543862.0 |
| 39 | 40 | South Carolina | 2375144.0 |
| 40 | 41 | South Dakota | 406841.0 |
| 41 | 42 | Tennessee | 3252774.0 |
| 42 | 43 | Texas | 12675500.0 |
| 43 | 44 | Utah | 1375645.0 |
| 44 | 45 | Vermont | 317597.0 |
| 45 | 46 | Virginia | 4071357.0 |
| 46 | 47 | Washington | 3375652.0 |
| 47 | 48 | West Virginia | 939524.0 |
| 48 | 49 | Wisconsin | 2864033.0 |
| 49 | 50 | Wyoming | 276192.0 |
# Female share per state: summed female_num divided by the state's pop2010.
# The division aligns the two frames on their row index, so it relies on
# county_complete3 and features6 having the same row labels per state.
county_complete3['female_percent'] = county_complete3['female_num'].div(features6['pop2010'])
county_complete3
| index | state | female_num | female_percent | |
|---|---|---|---|---|
| 0 | 0 | Alabama | 2459237.0 | 0.514513 |
| 1 | 1 | Alaska | 340578.0 | 0.479531 |
| 2 | 2 | Arizona | 3215163.0 | 0.502997 |
| 3 | 3 | Arkansas | 1484149.0 | 0.508982 |
| 4 | 4 | California | 18730716.0 | 0.502785 |
| 5 | 5 | Colorado | 2509043.0 | 0.498895 |
| 6 | 6 | Connecticut | 1835148.0 | 0.513458 |
| 7 | 7 | Delaware | 463031.0 | 0.515663 |
| 8 | 9 | Florida | 9613839.0 | 0.511339 |
| 9 | 10 | Georgia | 4957744.0 | 0.511759 |
| 10 | 11 | Hawaii | 678539.0 | 0.498815 |
| 11 | 12 | Idaho | 782011.0 | 0.498864 |
| 12 | 13 | Illinois | 6538181.0 | 0.509576 |
| 13 | 14 | Indiana | 3294306.0 | 0.508082 |
| 14 | 15 | Iowa | 1538196.0 | 0.504930 |
| 15 | 16 | Kansas | 1437803.0 | 0.503941 |
| 16 | 17 | Kentucky | 2204181.0 | 0.507950 |
| 17 | 18 | Louisiana | 2313889.0 | 0.510412 |
| 18 | 19 | Maine | 678503.0 | 0.510782 |
| 19 | 20 | Maryland | 2980796.0 | 0.516285 |
| 20 | 21 | Massachusetts | 3381707.0 | 0.516478 |
| 21 | 22 | Michigan | 5036146.0 | 0.509544 |
| 22 | 23 | Minnesota | 2671946.0 | 0.503768 |
| 23 | 24 | Mississippi | 1526108.0 | 0.514309 |
| 24 | 25 | Missouri | 3055921.0 | 0.510262 |
| 25 | 26 | Montana | 492719.0 | 0.497990 |
| 26 | 27 | Nebraska | 919746.0 | 0.503600 |
| 27 | 28 | Nevada | 1337536.0 | 0.495283 |
| 28 | 29 | New Hampshire | 666927.0 | 0.506603 |
| 29 | 30 | New Jersey | 4512482.0 | 0.513255 |
| 30 | 31 | New Mexico | 1041638.0 | 0.505851 |
| 31 | 32 | New York | 10001353.0 | 0.516116 |
| 32 | 33 | North Carolina | 4889810.0 | 0.512802 |
| 33 | 34 | North Dakota | 332680.0 | 0.494625 |
| 34 | 35 | Ohio | 5904814.0 | 0.511837 |
| 35 | 36 | Oklahoma | 1894345.0 | 0.504977 |
| 36 | 37 | Oregon | 1934471.0 | 0.504942 |
| 37 | 38 | Pennsylvania | 6511648.0 | 0.512632 |
| 38 | 39 | Rhode Island | 543862.0 | 0.516701 |
| 39 | 40 | South Carolina | 2375144.0 | 0.513504 |
| 40 | 41 | South Dakota | 406841.0 | 0.499694 |
| 41 | 42 | Tennessee | 3252774.0 | 0.512562 |
| 42 | 43 | Texas | 12675500.0 | 0.504085 |
| 43 | 44 | Utah | 1375645.0 | 0.497722 |
| 44 | 45 | Vermont | 317597.0 | 0.507553 |
| 45 | 46 | Virginia | 4071357.0 | 0.509251 |
| 46 | 47 | Washington | 3375652.0 | 0.501990 |
| 47 | 48 | West Virginia | 939524.0 | 0.507030 |
| 48 | 49 | Wisconsin | 2864033.0 | 0.503612 |
| 49 | 50 | Wyoming | 276192.0 | 0.490027 |
# Add a 'female_percent' column and give every election year from 1976 to
# 2020 the same per-state values taken from county_complete3. The assignment
# is positional (NumPy array), one value per state row of that year.
df['female_percent'] = 0
female_share_by_state = county_complete3['female_percent'].values
for election_year in range(1976, 2022, 2):
    df.loc[df['year'] == election_year, 'female_percent'] = female_share_by_state
df
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 | 3709468 | 2678112 | 0.721967 | 0.367481 | 0.514513 |
| 1 | 1976 | ALASKA | 117916 | 522944 | 383875 | 0.734065 | 0.307173 | 0.479531 |
| 2 | 1976 | ARIZONA | 729002 | 4279624 | 3497741 | 0.817301 | 0.208421 | 0.502997 |
| 3 | 1976 | ARKANSAS | 336383 | 2229968 | 1618708 | 0.725888 | 0.207810 | 0.508982 |
| 4 | 1976 | CALIFORNIA | 7442501 | 28253424 | 20528527 | 0.726585 | 0.362544 | 0.502785 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 | 8668704 | 6774559 | 0.781496 | 0.636319 | 0.509251 |
| 1147 | 2020 | WASHINGTON | 3885792 | 7501000 | 5679975 | 0.757229 | 0.684121 | 0.501990 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 1883907 | 1578350 | 0.837807 | 0.482393 | 0.507030 |
| 1149 | 2020 | WISCONSIN | 3235981 | 5947403 | 4715247 | 0.792825 | 0.686280 | 0.503612 |
| 1150 | 2020 | WYOMING | 270367 | 602588 | 471144 | 0.781868 | 0.573852 | 0.490027 |
1150 rows × 8 columns
# Per-county count derived from the bachelors_<year> column: population of
# that year times bachelors_<year>, where the latter is treated as a
# percentage (divided by 100), rounded down. 2017 is created before 2016 to
# keep the resulting column order.
for total_column, population_column, percent_column in (
    ('bachelors_sum_2017', 'pop2017', 'bachelors_2017'),
    ('bachelors_sum_2016', 'pop2016', 'bachelors_2016'),
):
    county_complete[total_column] = np.floor(
        county_complete[population_column] * (county_complete[percent_column] / 100)
    )
county_complete
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | VAP2010 | VAP2012 | female_num | bachelors_sum_2017 | bachelors_sum_2016 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 | 39945.0 | 40536.0 | 27994.0 | 13876.0 | 13598.0 |
| 1 | 01003 | Alabama | Baldwin County | 140415.0 | 182265 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | 202863.0 | ... | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 | 140344.0 | 146831.0 | 93137.0 | 65276.0 | 61215.0 |
| 2 | 01005 | Alabama | Barbour County | 29038.0 | 27457 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | 26264.0 | ... | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 | 21443.0 | 21278.0 | 12877.0 | 3032.0 | 3324.0 |
| 3 | 01007 | Alabama | Bibb County | 20826.0 | 22915 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | 22561.0 | ... | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 | 17713.0 | 17609.0 | 10609.0 | 2992.0 | 2715.0 |
| 4 | 01009 | Alabama | Blount County | 51024.0 | 57322 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | 57590.0 | ... | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 | 43220.0 | 43587.0 | 28947.0 | 7599.0 | 7540.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3137 | 56037 | Wyoming | Sweetwater County | 37613.0 | 43806 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | 44732.0 | ... | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 | 31934.0 | 32889.0 | 20939.0 | 9664.0 | 9468.0 |
| 3138 | 56039 | Wyoming | Teton County | 18251.0 | 21294 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | 23029.0 | ... | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 | 17226.0 | 17562.0 | 10093.0 | 12586.0 | 12470.0 |
| 3139 | 56041 | Wyoming | Uinta County | 19742.0 | 21118 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | 20780.0 | ... | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 | 14740.0 | 14699.0 | 10453.0 | 3566.0 | 3769.0 |
| 3140 | 56043 | Wyoming | Washakie County | 8289.0 | 8533 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | 8296.0 | ... | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 | 6357.0 | 6288.0 | 4275.0 | 1693.0 | 1727.0 |
| 3141 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 | 5636.0 | 5543.0 | 3416.0 | 1371.0 | 1353.0 |
3142 rows × 193 columns
# Aggregate the county-level head counts and populations up to state level.
# The columns are selected with a *list*: indexing a GroupBy with a bare tuple
# of names (['a', 'b', ...] without the inner brackets) was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
state_level_columns = ['bachelors_sum_2016', 'bachelors_sum_2017', 'pop2017', 'pop2016']
county_complete4 = county_complete.groupby('state', as_index=False)[state_level_columns].sum()

# Drop District of Columbia so that only the 50 states remain.
# reset_index() (without drop=True) keeps the pre-filter row labels in an
# 'index' column, exactly as the original cell did.
county_complete4 = county_complete4[county_complete4['state'] != 'District of Columbia'].reset_index()
county_complete4
| index | state | bachelors_sum_2016 | bachelors_sum_2017 | pop2017 | pop2016 | |
|---|---|---|---|---|---|---|
| 0 | 0 | Alabama | 1177586.0 | 1206184.0 | 4874747.0 | 4860545.0 |
| 1 | 1 | Alaska | 209941.0 | 210800.0 | 733972.0 | 735745.0 |
| 2 | 2 | Arizona | 1940984.0 | 2001339.0 | 7016270.0 | 6908642.0 |
| 3 | 3 | Arkansas | 651955.0 | 671790.0 | 3004279.0 | 2988231.0 |
| 4 | 4 | California | 12423475.0 | 12706851.0 | 39536653.0 | 39296476.0 |
| 5 | 5 | Colorado | 2140208.0 | 2209034.0 | 5607154.0 | 5530105.0 |
| 6 | 6 | Connecticut | 1368712.0 | 1381762.0 | 3588184.0 | 3587685.0 |
| 7 | 7 | Delaware | 290449.0 | 297954.0 | 961939.0 | 952698.0 |
| 8 | 9 | Florida | 5779295.0 | 5987483.0 | 20984400.0 | 20656589.0 |
| 9 | 10 | Georgia | 3049269.0 | 3136717.0 | 10429379.0 | 10313620.0 |
| 10 | 11 | Hawaii | 448897.0 | 457543.0 | 1427538.0 | 1428683.0 |
| 11 | 12 | Idaho | 442734.0 | 460636.0 | 1716943.0 | 1680026.0 |
| 12 | 13 | Illinois | 4247707.0 | 4302283.0 | 12802023.0 | 12835726.0 |
| 13 | 14 | Indiana | 1656191.0 | 1711925.0 | 6666818.0 | 6634007.0 |
| 14 | 15 | Iowa | 869795.0 | 889392.0 | 3145711.0 | 3130869.0 |
| 15 | 16 | Kansas | 925703.0 | 948527.0 | 2913123.0 | 2907731.0 |
| 16 | 17 | Kentucky | 1018507.0 | 1045038.0 | 4454189.0 | 4436113.0 |
| 17 | 18 | Louisiana | 1079751.0 | 1099420.0 | 4684333.0 | 4686157.0 |
| 18 | 19 | Maine | 391004.0 | 405369.0 | 1335907.0 | 1330232.0 |
| 19 | 20 | Maryland | 2317761.0 | 2364163.0 | 6052177.0 | 6024752.0 |
| 20 | 21 | Massachusetts | 2810023.0 | 2884571.0 | 6859819.0 | 6823721.0 |
| 21 | 22 | Michigan | 2746139.0 | 2819725.0 | 9962311.0 | 9933445.0 |
| 22 | 23 | Minnesota | 1898587.0 | 1943724.0 | 5576606.0 | 5525050.0 |
| 23 | 24 | Mississippi | 635003.0 | 644336.0 | 2984100.0 | 2985415.0 |
| 24 | 25 | Missouri | 1691739.0 | 1730133.0 | 6113532.0 | 6091176.0 |
| 25 | 26 | Montana | 313942.0 | 326078.0 | 1050493.0 | 1038656.0 |
| 26 | 27 | Nebraska | 578814.0 | 593679.0 | 1920076.0 | 1907603.0 |
| 27 | 28 | Nevada | 681648.0 | 711241.0 | 2998039.0 | 2939254.0 |
| 28 | 29 | New Hampshire | 474475.0 | 484683.0 | 1342795.0 | 1335015.0 |
| 29 | 30 | New Jersey | 3358033.0 | 3424183.0 | 9005644.0 | 8978416.0 |
| 30 | 31 | New Mexico | 549578.0 | 555094.0 | 2088070.0 | 2085432.0 |
| 31 | 32 | New York | 6842027.0 | 6957052.0 | 19849399.0 | 19836286.0 |
| 32 | 33 | North Carolina | 2986382.0 | 3104071.0 | 10273419.0 | 10156689.0 |
| 33 | 34 | North Dakota | 215489.0 | 220887.0 | 755393.0 | 755548.0 |
| 34 | 35 | Ohio | 3122615.0 | 3198644.0 | 11658609.0 | 11622554.0 |
| 35 | 36 | Oklahoma | 969557.0 | 985379.0 | 3930864.0 | 3921207.0 |
| 36 | 37 | Oregon | 1288213.0 | 1338134.0 | 4142776.0 | 4085989.0 |
| 37 | 38 | Pennsylvania | 3764539.0 | 3863560.0 | 12805537.0 | 12787085.0 |
| 38 | 39 | Rhode Island | 342406.0 | 348216.0 | 1059639.0 | 1057566.0 |
| 39 | 40 | South Carolina | 1319135.0 | 1363079.0 | 5024369.0 | 4959822.0 |
| 40 | 41 | South Dakota | 236484.0 | 242101.0 | 869666.0 | 861542.0 |
| 41 | 42 | Tennessee | 1709642.0 | 1772301.0 | 6715984.0 | 6649404.0 |
| 42 | 43 | Texas | 7865606.0 | 8139329.0 | 28304596.0 | 27904862.0 |
| 43 | 44 | Utah | 972715.0 | 1018490.0 | 3101833.0 | 3044321.0 |
| 44 | 45 | Vermont | 227717.0 | 231512.0 | 623657.0 | 623354.0 |
| 45 | 46 | Virginia | 3128649.0 | 3204193.0 | 8470020.0 | 8414380.0 |
| 46 | 47 | Washington | 2429313.0 | 2532132.0 | 7405743.0 | 7280934.0 |
| 47 | 48 | West Virginia | 363285.0 | 366421.0 | 1815857.0 | 1828637.0 |
| 48 | 49 | Wisconsin | 1650746.0 | 1694242.0 | 5795483.0 | 5772917.0 |
| 49 | 50 | Wyoming | 152941.0 | 156015.0 | 579315.0 | 584910.0 |
# State-level share of bachelor's-degree holders (a fraction of the population,
# not a 0-100 percentage) for each of the two available years.
for yr in ('2016', '2017'):
    county_complete4['bachelors_percent_' + yr] = (
        county_complete4['bachelors_sum_' + yr] / county_complete4['pop' + yr]
    )
county_complete4
| index | state | bachelors_sum_2016 | bachelors_sum_2017 | pop2017 | pop2016 | bachelors_percent_2016 | bachelors_percent_2017 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Alabama | 1177586.0 | 1206184.0 | 4874747.0 | 4860545.0 | 0.242274 | 0.247435 |
| 1 | 1 | Alaska | 209941.0 | 210800.0 | 733972.0 | 735745.0 | 0.285345 | 0.287204 |
| 2 | 2 | Arizona | 1940984.0 | 2001339.0 | 7016270.0 | 6908642.0 | 0.280950 | 0.285243 |
| 3 | 3 | Arkansas | 651955.0 | 671790.0 | 3004279.0 | 2988231.0 | 0.218174 | 0.223611 |
| 4 | 4 | California | 12423475.0 | 12706851.0 | 39536653.0 | 39296476.0 | 0.316147 | 0.321394 |
| 5 | 5 | Colorado | 2140208.0 | 2209034.0 | 5607154.0 | 5530105.0 | 0.387010 | 0.393967 |
| 6 | 6 | Connecticut | 1368712.0 | 1381762.0 | 3588184.0 | 3587685.0 | 0.381503 | 0.385087 |
| 7 | 7 | Delaware | 290449.0 | 297954.0 | 961939.0 | 952698.0 | 0.304870 | 0.309743 |
| 8 | 9 | Florida | 5779295.0 | 5987483.0 | 20984400.0 | 20656589.0 | 0.279780 | 0.285330 |
| 9 | 10 | Georgia | 3049269.0 | 3136717.0 | 10429379.0 | 10313620.0 | 0.295655 | 0.300758 |
| 10 | 11 | Hawaii | 448897.0 | 457543.0 | 1427538.0 | 1428683.0 | 0.314203 | 0.320512 |
| 11 | 12 | Idaho | 442734.0 | 460636.0 | 1716943.0 | 1680026.0 | 0.263528 | 0.268288 |
| 12 | 13 | Illinois | 4247707.0 | 4302283.0 | 12802023.0 | 12835726.0 | 0.330928 | 0.336063 |
| 13 | 14 | Indiana | 1656191.0 | 1711925.0 | 6666818.0 | 6634007.0 | 0.249652 | 0.256783 |
| 14 | 15 | Iowa | 869795.0 | 889392.0 | 3145711.0 | 3130869.0 | 0.277813 | 0.282732 |
| 15 | 16 | Kansas | 925703.0 | 948527.0 | 2913123.0 | 2907731.0 | 0.318359 | 0.325605 |
| 16 | 17 | Kentucky | 1018507.0 | 1045038.0 | 4454189.0 | 4436113.0 | 0.229594 | 0.234619 |
| 17 | 18 | Louisiana | 1079751.0 | 1099420.0 | 4684333.0 | 4686157.0 | 0.230413 | 0.234702 |
| 18 | 19 | Maine | 391004.0 | 405369.0 | 1335907.0 | 1330232.0 | 0.293937 | 0.303441 |
| 19 | 20 | Maryland | 2317761.0 | 2364163.0 | 6052177.0 | 6024752.0 | 0.384706 | 0.390630 |
| 20 | 21 | Massachusetts | 2810023.0 | 2884571.0 | 6859819.0 | 6823721.0 | 0.411802 | 0.420502 |
| 21 | 22 | Michigan | 2746139.0 | 2819725.0 | 9962311.0 | 9933445.0 | 0.276454 | 0.283039 |
| 22 | 23 | Minnesota | 1898587.0 | 1943724.0 | 5576606.0 | 5525050.0 | 0.343633 | 0.348550 |
| 23 | 24 | Mississippi | 635003.0 | 644336.0 | 2984100.0 | 2985415.0 | 0.212702 | 0.215923 |
| 24 | 25 | Missouri | 1691739.0 | 1730133.0 | 6113532.0 | 6091176.0 | 0.277736 | 0.283001 |
| 25 | 26 | Montana | 313942.0 | 326078.0 | 1050493.0 | 1038656.0 | 0.302258 | 0.310405 |
| 26 | 27 | Nebraska | 578814.0 | 593679.0 | 1920076.0 | 1907603.0 | 0.303425 | 0.309196 |
| 27 | 28 | Nevada | 681648.0 | 711241.0 | 2998039.0 | 2939254.0 | 0.231912 | 0.237235 |
| 28 | 29 | New Hampshire | 474475.0 | 484683.0 | 1342795.0 | 1335015.0 | 0.355408 | 0.360951 |
| 29 | 30 | New Jersey | 3358033.0 | 3424183.0 | 9005644.0 | 8978416.0 | 0.374012 | 0.380226 |
| 30 | 31 | New Mexico | 549578.0 | 555094.0 | 2088070.0 | 2085432.0 | 0.263532 | 0.265841 |
| 31 | 32 | New York | 6842027.0 | 6957052.0 | 19849399.0 | 19836286.0 | 0.344925 | 0.350492 |
| 32 | 33 | North Carolina | 2986382.0 | 3104071.0 | 10273419.0 | 10156689.0 | 0.294031 | 0.302146 |
| 33 | 34 | North Dakota | 215489.0 | 220887.0 | 755393.0 | 755548.0 | 0.285209 | 0.292413 |
| 34 | 35 | Ohio | 3122615.0 | 3198644.0 | 11658609.0 | 11622554.0 | 0.268669 | 0.274359 |
| 35 | 36 | Oklahoma | 969557.0 | 985379.0 | 3930864.0 | 3921207.0 | 0.247260 | 0.250677 |
| 36 | 37 | Oregon | 1288213.0 | 1338134.0 | 4142776.0 | 4085989.0 | 0.315276 | 0.323004 |
| 37 | 38 | Pennsylvania | 3764539.0 | 3863560.0 | 12805537.0 | 12787085.0 | 0.294402 | 0.301710 |
| 38 | 39 | Rhode Island | 342406.0 | 348216.0 | 1059639.0 | 1057566.0 | 0.323768 | 0.328618 |
| 39 | 40 | South Carolina | 1319135.0 | 1363079.0 | 5024369.0 | 4959822.0 | 0.265964 | 0.271294 |
| 40 | 41 | South Dakota | 236484.0 | 242101.0 | 869666.0 | 861542.0 | 0.274489 | 0.278384 |
| 41 | 42 | Tennessee | 1709642.0 | 1772301.0 | 6715984.0 | 6649404.0 | 0.257112 | 0.263893 |
| 42 | 43 | Texas | 7865606.0 | 8139329.0 | 28304596.0 | 27904862.0 | 0.281872 | 0.287562 |
| 43 | 44 | Utah | 972715.0 | 1018490.0 | 3101833.0 | 3044321.0 | 0.319518 | 0.328351 |
| 44 | 45 | Vermont | 227717.0 | 231512.0 | 623657.0 | 623354.0 | 0.365309 | 0.371217 |
| 45 | 46 | Virginia | 3128649.0 | 3204193.0 | 8470020.0 | 8414380.0 | 0.371822 | 0.378298 |
| 46 | 47 | Washington | 2429313.0 | 2532132.0 | 7405743.0 | 7280934.0 | 0.333654 | 0.341915 |
| 47 | 48 | West Virginia | 363285.0 | 366421.0 | 1815857.0 | 1828637.0 | 0.198664 | 0.201790 |
| 48 | 49 | Wisconsin | 1650746.0 | 1694242.0 | 5795483.0 | 5772917.0 | 0.285947 | 0.292338 |
| 49 | 50 | Wyoming | 152941.0 | 156015.0 | 579315.0 | 584910.0 | 0.261478 | 0.269309 |
# Average 2016 -> 2017 change in the bachelor's share across the states.
county_complete4['bachelors_percent_2017'].sub(county_complete4['bachelors_percent_2016']).mean()
0.005774049488857494
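On average, the share of people with a bachelor's degree increases by about 0.005 per year (roughly half a percentage point), based on the 2016 → 2017 change across the states.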
# Extrapolate each state's 2016 bachelor's share to every election year in df.
#
# The column is created as float (0.0, not 0): it is filled with float values
# below, and writing floats into an int column through .loc triggers an
# incompatible-dtype FutureWarning in recent pandas.
#
# NOTE(review): the values are assigned positionally, so this relies on every
# year in df having one row per state in the same order as county_complete4
# -- confirm.
# NOTE(review): the 0.005 figure was measured as a one-year absolute change,
# but it is applied here as a relative change per two-year step; kept as-is
# to preserve the existing results -- confirm this is intended.
df['bachelors_percent'] = 0.0

# Walk backwards from 2016 to 1976, shrinking the share by 0.5% per step.
tmp = county_complete4['bachelors_percent_2016'].values
for i in range(2016, 1974, -2):
    if i != 2016:
        tmp = tmp * (1 - 0.005)
    df.loc[df['year'] == i, 'bachelors_percent'] = tmp

# Walk forwards from 2016 to 2018 and 2020, growing the share by 0.5% per step.
tmp = county_complete4['bachelors_percent_2016'].values
for i in range(2018, 2022, 2):
    tmp = tmp * (1 + 0.005)
    df.loc[df['year'] == i, 'bachelors_percent'] = tmp
df
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 | 3709468 | 2678112 | 0.721967 | 0.367481 | 0.514513 | 0.219164 |
| 1 | 1976 | ALASKA | 117916 | 522944 | 383875 | 0.734065 | 0.307173 | 0.479531 | 0.258126 |
| 2 | 1976 | ARIZONA | 729002 | 4279624 | 3497741 | 0.817301 | 0.208421 | 0.502997 | 0.254150 |
| 3 | 1976 | ARKANSAS | 336383 | 2229968 | 1618708 | 0.725888 | 0.207810 | 0.508982 | 0.197363 |
| 4 | 1976 | CALIFORNIA | 7442501 | 28253424 | 20528527 | 0.726585 | 0.362544 | 0.502785 | 0.285990 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 | 8668704 | 6774559 | 0.781496 | 0.636319 | 0.509251 | 0.375549 |
| 1147 | 2020 | WASHINGTON | 3885792 | 7501000 | 5679975 | 0.757229 | 0.684121 | 0.501990 | 0.336999 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 1883907 | 1578350 | 0.837807 | 0.482393 | 0.507030 | 0.200656 |
| 1149 | 2020 | WISCONSIN | 3235981 | 5947403 | 4715247 | 0.792825 | 0.686280 | 0.503612 | 0.288813 |
| 1150 | 2020 | WYOMING | 270367 | 602588 | 471144 | 0.781868 | 0.573852 | 0.490027 | 0.264099 |
1150 rows × 9 columns
# Hold out the 2010 and 2012 elections for evaluation; train on all other years.
# NOTE(review): in the displayed rows voter_turnout matches totalVotes / VAP
# (e.g. 984154 / 2678112 ~= 0.367481), so both inputs of the target are among
# the features -- confirm whether this leakage is intended.
feature_columns = ['totalVotes', 'population', 'VAP', 'can_vote_percent', 'female_percent', 'bachelors_percent']
is_held_out = df['year'].isin([2010, 2012])

train_data = df[~is_held_out]
test_data_2010 = df[df['year'] == 2010]
test_data_2012 = df[df['year'] == 2012]

# features / label for each subset
X_train, y_train = train_data[feature_columns], train_data['voter_turnout']
X_test_2010, y_test_2010 = test_data_2010[feature_columns], test_data_2010['voter_turnout']
X_test_2012, y_test_2012 = test_data_2012[feature_columns], test_data_2012['voter_turnout']
# Tune an AdaBoost regressor with an exhaustive grid search (10-fold CV).
ab = AdaBoostRegressor(random_state=RSEED)

# candidate hyper-parameters
parameters_grid = dict(
    n_estimators=[50, 100, 200, 300],
    learning_rate=[0.01, 0.1, 1.0],
)

# search, then keep the estimator refit with the winning combination
grid_search = GridSearchCV(estimator=ab, param_grid=parameters_grid, cv=10)
grid_search.fit(X_train, y_train)
best = grid_search.best_estimator_

# show the selected parameters as a one-row table
selected_params = pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value'])
selected_params.T
| learning_rate | n_estimators | |
|---|---|---|
| Selected Value | 1.0 | 300.0 |
# Score the tuned model on the held-out 2010 election.
y_pred_2010 = best.predict(X_test_2010)

# MSE, MAE, RMSE and R squared on the 2010 test data (rounded to 3 decimals);
# RMSE is taken from the unrounded MSE.
mse_2010 = mean_squared_error(y_test_2010, y_pred_2010)
MSE = round(mse_2010, 3)
MAE = round(mean_absolute_error(y_test_2010, y_pred_2010), 3)
RMSE = round(np.sqrt(mse_2010), 3)
R2 = round(r2_score(y_test_2010, y_pred_2010), 3)

# one-row summary table
df2 = pd.DataFrame(
    [[MSE, MAE, RMSE, R2]],
    index=['AdaBoost Regressor'],
    columns=['MSE', 'MAE', 'RMSE', 'R2'],
)
df2
| MSE | MAE | RMSE | R2 | |
|---|---|---|---|---|
| AdaBoost Regressor | 0.005 | 0.058 | 0.071 | -0.194 |
# Rows of the 2010 election. An explicit copy is taken because this frame gets
# a new column later in the notebook; without it that write targets a slice of
# df (SettingWithCopy, currently hidden by the global warnings filter).
tmp2010 = df[df['year'] == 2010].copy()
tmp2010
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | |
|---|---|---|---|---|---|---|---|---|---|
| 850 | 2010 | ALABAMA | 1359759 | 4779736 | 3646981 | 0.763009 | 0.372845 | 0.514513 | 0.238659 |
| 851 | 2010 | ALASKA | 252990 | 710231 | 522751 | 0.736030 | 0.483959 | 0.479531 | 0.281086 |
| 852 | 2010 | ARIZONA | 1698135 | 6392017 | 4763130 | 0.745169 | 0.356517 | 0.502997 | 0.276757 |
| 853 | 2010 | ARKANSAS | 773866 | 2915918 | 2204313 | 0.755959 | 0.351069 | 0.508982 | 0.214918 |
| 854 | 2010 | CALIFORNIA | 9644560 | 37253956 | 27955194 | 0.750395 | 0.345001 | 0.502785 | 0.311429 |
| 855 | 2010 | COLORADO | 1763106 | 5029196 | 3803001 | 0.756185 | 0.463609 | 0.498895 | 0.381234 |
| 856 | 2010 | CONNECTICUT | 1138116 | 3574097 | 2756819 | 0.771333 | 0.412837 | 0.513458 | 0.375809 |
| 857 | 2010 | DELAWARE | 305636 | 897934 | 692372 | 0.771072 | 0.441433 | 0.515663 | 0.300320 |
| 858 | 2010 | FLORIDA | 5116018 | 18801310 | 14797963 | 0.787071 | 0.345724 | 0.511339 | 0.275604 |
| 859 | 2010 | GEORGIA | 2468489 | 9687653 | 7196177 | 0.742819 | 0.343028 | 0.511759 | 0.291242 |
| 860 | 2010 | HAWAII | 360121 | 1360301 | 1056446 | 0.776627 | 0.340880 | 0.498815 | 0.309514 |
| 861 | 2010 | IDAHO | 447144 | 1567582 | 1138528 | 0.726296 | 0.392739 | 0.498864 | 0.259595 |
| 862 | 2010 | ILLINOIS | 3696108 | 12830632 | 9702168 | 0.756172 | 0.380957 | 0.509576 | 0.325989 |
| 863 | 2010 | INDIANA | 1747640 | 6483802 | 4874896 | 0.751858 | 0.358498 | 0.508082 | 0.245926 |
| 864 | 2010 | IOWA | 1094452 | 3046355 | 2318473 | 0.761065 | 0.472057 | 0.504930 | 0.273666 |
| 865 | 2010 | KANSAS | 835529 | 2853118 | 2126260 | 0.745241 | 0.392957 | 0.503941 | 0.313608 |
| 866 | 2010 | KENTUCKY | 1354051 | 4339367 | 3315647 | 0.764085 | 0.408382 | 0.507950 | 0.226168 |
| 867 | 2010 | LOUISIANA | 1035948 | 4533372 | 3415094 | 0.753323 | 0.303344 | 0.510412 | 0.226974 |
| 868 | 2010 | MAINE | 564326 | 1328361 | 1053779 | 0.793293 | 0.535526 | 0.510782 | 0.289550 |
| 869 | 2010 | MARYLAND | 1823638 | 5773552 | 4419968 | 0.765554 | 0.412591 | 0.516285 | 0.378965 |
| 870 | 2010 | MASSACHUSETTS | 2219813 | 6547629 | 5129171 | 0.783363 | 0.432782 | 0.516478 | 0.405656 |
| 871 | 2010 | MICHIGAN | 3194857 | 9883640 | 7539072 | 0.762783 | 0.423773 | 0.509544 | 0.272328 |
| 872 | 2010 | MINNESOTA | 2089062 | 5303925 | 4019316 | 0.757800 | 0.519756 | 0.503768 | 0.338504 |
| 873 | 2010 | MISSISSIPPI | 788549 | 2967297 | 2211757 | 0.745378 | 0.356526 | 0.514309 | 0.209527 |
| 874 | 2010 | MISSOURI | 1919791 | 5988927 | 4563510 | 0.761991 | 0.420683 | 0.510262 | 0.273591 |
| 875 | 2010 | MONTANA | 360341 | 989415 | 765792 | 0.773985 | 0.470547 | 0.497990 | 0.297747 |
| 876 | 2010 | NEBRASKA | 465510 | 1826341 | 1366743 | 0.748350 | 0.340598 | 0.503600 | 0.298896 |
| 877 | 2010 | NEVADA | 702788 | 2700551 | 2036187 | 0.753989 | 0.345149 | 0.495283 | 0.228451 |
| 878 | 2010 | NEW HAMPSHIRE | 449787 | 1316470 | 1029354 | 0.781905 | 0.436960 | 0.506603 | 0.350103 |
| 879 | 2010 | NEW JERSEY | 2121584 | 8791894 | 6725870 | 0.765008 | 0.315436 | 0.513255 | 0.368430 |
| 880 | 2010 | NEW MEXICO | 596651 | 2059179 | 1540481 | 0.748104 | 0.387315 | 0.505851 | 0.259599 |
| 881 | 2010 | NEW YORK | 4484408 | 19378102 | 15052653 | 0.776787 | 0.297915 | 0.516116 | 0.339777 |
| 882 | 2010 | NORTH CAROLINA | 2662110 | 9535483 | 7253813 | 0.760718 | 0.366995 | 0.512802 | 0.289643 |
| 883 | 2010 | NORTH DAKOTA | 236344 | 672591 | 522655 | 0.777077 | 0.452199 | 0.494625 | 0.280952 |
| 884 | 2010 | OHIO | 3825014 | 11536504 | 8806302 | 0.763342 | 0.434350 | 0.511837 | 0.264659 |
| 885 | 2010 | OKLAHOMA | 792980 | 3751351 | 2821701 | 0.752183 | 0.281029 | 0.504977 | 0.243569 |
| 886 | 2010 | OREGON | 1427027 | 3831074 | 2965220 | 0.773992 | 0.481255 | 0.504942 | 0.310570 |
| 887 | 2010 | PENNSYLVANIA | 3956401 | 12702379 | 9909668 | 0.780143 | 0.399247 | 0.512632 | 0.290008 |
| 888 | 2010 | RHODE ISLAND | 335004 | 1052567 | 828361 | 0.786991 | 0.404418 | 0.516701 | 0.318936 |
| 889 | 2010 | SOUTH CAROLINA | 1339410 | 4625364 | 3545098 | 0.766447 | 0.377820 | 0.513504 | 0.261995 |
| 890 | 2010 | SOUTH DAKOTA | 319426 | 814180 | 611357 | 0.750887 | 0.522487 | 0.499694 | 0.270392 |
| 891 | 2010 | TENNESSEE | 1559120 | 6346105 | 4849154 | 0.764115 | 0.321524 | 0.512562 | 0.253275 |
| 892 | 2010 | TEXAS | 4744189 | 25145561 | 18283803 | 0.727119 | 0.259475 | 0.504085 | 0.277665 |
| 893 | 2010 | UTAH | 640495 | 2763885 | 1893012 | 0.684910 | 0.338347 | 0.497722 | 0.314749 |
| 894 | 2010 | VERMONT | 238335 | 625741 | 496428 | 0.793344 | 0.480100 | 0.507553 | 0.359857 |
| 895 | 2010 | VIRGINIA | 2184271 | 7994802 | 6142207 | 0.768275 | 0.355617 | 0.509251 | 0.366272 |
| 896 | 2010 | WASHINGTON | 2479409 | 6724540 | 5143903 | 0.764945 | 0.482009 | 0.501990 | 0.328674 |
| 897 | 2010 | WEST VIRGINIA | 514373 | 1852994 | 1465505 | 0.790885 | 0.350987 | 0.507030 | 0.195699 |
| 898 | 2010 | WISCONSIN | 2138775 | 5686986 | 4348246 | 0.764596 | 0.491871 | 0.503612 | 0.281679 |
| 899 | 2010 | WYOMING | 186682 | 563626 | 428209 | 0.759740 | 0.435960 | 0.490027 | 0.257575 |
# Attach the 2010 predictions and keep four example states for plotting.
# assign() returns a new DataFrame, so the column is never written into a
# slice of df (the original in-place assignment raised a SettingWithCopy
# warning that the global warnings filter was hiding).
tmp2010 = tmp2010.assign(predicted_turnout=y_pred_2010)
state_list = ['CALIFORNIA','FLORIDA','SOUTH DAKOTA','WYOMING']
tmp2010 = tmp2010[tmp2010['state'].isin(state_list)]
tmp2010
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | |
|---|---|---|---|---|---|---|---|---|---|---|
| 854 | 2010 | CALIFORNIA | 9644560 | 37253956 | 27955194 | 0.750395 | 0.345001 | 0.502785 | 0.311429 | 0.459730 |
| 858 | 2010 | FLORIDA | 5116018 | 18801310 | 14797963 | 0.787071 | 0.345724 | 0.511339 | 0.275604 | 0.392804 |
| 890 | 2010 | SOUTH DAKOTA | 319426 | 814180 | 611357 | 0.750887 | 0.522487 | 0.499694 | 0.270392 | 0.512252 |
| 899 | 2010 | WYOMING | 186682 | 563626 | 428209 | 0.759740 | 0.435960 | 0.490027 | 0.257575 | 0.455831 |
# Grouped bar chart: actual vs. predicted turnout for the selected states.
ax_2010 = tmp2010.plot.bar(x="state", y=["voter_turnout", "predicted_turnout"], figsize=(9, 8))
ax_2010.set_title("Predicted Vs Actual - 2010")
# render the figure
plt.show()
# Residuals plot and prediction-error plot for the 2010 hold-out.
# The hyper-parameters are read from the grid search instead of being
# hard-coded, so the plots always describe the model that was actually selected.
tuned_params = grid_search.best_params_
viz = residuals_plot(AdaBoostRegressor(random_state=RSEED, **tuned_params), X_train, y_train, X_test_2010, y_test_2010)
visualizer = PredictionError(AdaBoostRegressor(random_state=RSEED, **tuned_params))
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test_2010, y_test_2010)  # Evaluate the model on the test data
visualizer.show()
<AxesSubplot:title={'center':'Prediction Error for AdaBoostRegressor'}, xlabel='$y$', ylabel='$\\hat{y}$'>
# Rank the states by absolute prediction error for 2010.
# .copy() makes states5 an independent frame, so the two new columns are not
# written into a slice of df (SettingWithCopy).
states5 = df[df['year'] == 2010].copy()
states5['predicted_turnout'] = y_pred_2010
states5['abs_diff'] = (states5['voter_turnout'] - states5['predicted_turnout']).abs()
states5.sort_values('abs_diff', ascending=False)
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | abs_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 893 | 2010 | UTAH | 640495 | 2763885 | 1893012 | 0.684910 | 0.338347 | 0.497722 | 0.314749 | 0.504164 | 0.165817 |
| 860 | 2010 | HAWAII | 360121 | 1360301 | 1056446 | 0.776627 | 0.340880 | 0.498815 | 0.309514 | 0.479731 | 0.138852 |
| 859 | 2010 | GEORGIA | 2468489 | 9687653 | 7196177 | 0.742819 | 0.343028 | 0.511759 | 0.291242 | 0.481436 | 0.138408 |
| 882 | 2010 | NORTH CAROLINA | 2662110 | 9535483 | 7253813 | 0.760718 | 0.366995 | 0.512802 | 0.289643 | 0.489233 | 0.122238 |
| 892 | 2010 | TEXAS | 4744189 | 25145561 | 18283803 | 0.727119 | 0.259475 | 0.504085 | 0.277665 | 0.379802 | 0.120327 |
| 854 | 2010 | CALIFORNIA | 9644560 | 37253956 | 27955194 | 0.750395 | 0.345001 | 0.502785 | 0.311429 | 0.459730 | 0.114729 |
| 865 | 2010 | KANSAS | 835529 | 2853118 | 2126260 | 0.745241 | 0.392957 | 0.503941 | 0.313608 | 0.502215 | 0.109258 |
| 879 | 2010 | NEW JERSEY | 2121584 | 8791894 | 6725870 | 0.765008 | 0.315436 | 0.513255 | 0.368430 | 0.423926 | 0.108490 |
| 862 | 2010 | ILLINOIS | 3696108 | 12830632 | 9702168 | 0.756172 | 0.380957 | 0.509576 | 0.325989 | 0.482796 | 0.101839 |
| 881 | 2010 | NEW YORK | 4484408 | 19378102 | 15052653 | 0.776787 | 0.297915 | 0.516116 | 0.339777 | 0.388761 | 0.090846 |
| 861 | 2010 | IDAHO | 447144 | 1567582 | 1138528 | 0.726296 | 0.392739 | 0.498864 | 0.259595 | 0.483192 | 0.090454 |
| 876 | 2010 | NEBRASKA | 465510 | 1826341 | 1366743 | 0.748350 | 0.340598 | 0.503600 | 0.298896 | 0.426366 | 0.085767 |
| 880 | 2010 | NEW MEXICO | 596651 | 2059179 | 1540481 | 0.748104 | 0.387315 | 0.505851 | 0.259599 | 0.469367 | 0.082052 |
| 895 | 2010 | VIRGINIA | 2184271 | 7994802 | 6142207 | 0.768275 | 0.355617 | 0.509251 | 0.366272 | 0.437205 | 0.081588 |
| 852 | 2010 | ARIZONA | 1698135 | 6392017 | 4763130 | 0.745169 | 0.356517 | 0.502997 | 0.276757 | 0.437205 | 0.080688 |
| 871 | 2010 | MICHIGAN | 3194857 | 9883640 | 7539072 | 0.762783 | 0.423773 | 0.509544 | 0.272328 | 0.502215 | 0.078442 |
| 889 | 2010 | SOUTH CAROLINA | 1339410 | 4625364 | 3545098 | 0.766447 | 0.377820 | 0.513504 | 0.261995 | 0.456187 | 0.078366 |
| 888 | 2010 | RHODE ISLAND | 335004 | 1052567 | 828361 | 0.786991 | 0.404418 | 0.516701 | 0.318936 | 0.481418 | 0.077000 |
| 887 | 2010 | PENNSYLVANIA | 3956401 | 12702379 | 9909668 | 0.780143 | 0.399247 | 0.512632 | 0.290008 | 0.475896 | 0.076649 |
| 877 | 2010 | NEVADA | 702788 | 2700551 | 2036187 | 0.753989 | 0.345149 | 0.495283 | 0.228451 | 0.418019 | 0.072870 |
| 891 | 2010 | TENNESSEE | 1559120 | 6346105 | 4849154 | 0.764115 | 0.321524 | 0.512562 | 0.253275 | 0.388206 | 0.066682 |
| 857 | 2010 | DELAWARE | 305636 | 897934 | 692372 | 0.771072 | 0.441433 | 0.515663 | 0.300320 | 0.504745 | 0.063312 |
| 869 | 2010 | MARYLAND | 1823638 | 5773552 | 4419968 | 0.765554 | 0.412591 | 0.516285 | 0.378965 | 0.471649 | 0.059058 |
| 850 | 2010 | ALABAMA | 1359759 | 4779736 | 3646981 | 0.763009 | 0.372845 | 0.514513 | 0.238659 | 0.425759 | 0.052914 |
| 856 | 2010 | CONNECTICUT | 1138116 | 3574097 | 2756819 | 0.771333 | 0.412837 | 0.513458 | 0.375809 | 0.465203 | 0.052366 |
| 878 | 2010 | NEW HAMPSHIRE | 449787 | 1316470 | 1029354 | 0.781905 | 0.436960 | 0.506603 | 0.350103 | 0.488089 | 0.051129 |
| 885 | 2010 | OKLAHOMA | 792980 | 3751351 | 2821701 | 0.752183 | 0.281029 | 0.504977 | 0.243569 | 0.328256 | 0.047227 |
| 884 | 2010 | OHIO | 3825014 | 11536504 | 8806302 | 0.763342 | 0.434350 | 0.511837 | 0.264659 | 0.481436 | 0.047086 |
| 858 | 2010 | FLORIDA | 5116018 | 18801310 | 14797963 | 0.787071 | 0.345724 | 0.511339 | 0.275604 | 0.392804 | 0.047080 |
| 853 | 2010 | ARKANSAS | 773866 | 2915918 | 2204313 | 0.755959 | 0.351069 | 0.508982 | 0.214918 | 0.391468 | 0.040399 |
| 873 | 2010 | MISSISSIPPI | 788549 | 2967297 | 2211757 | 0.745378 | 0.356526 | 0.514309 | 0.209527 | 0.395209 | 0.038683 |
| 868 | 2010 | MAINE | 564326 | 1328361 | 1053779 | 0.793293 | 0.535526 | 0.510782 | 0.289550 | 0.498661 | 0.036865 |
| 866 | 2010 | KENTUCKY | 1354051 | 4339367 | 3315647 | 0.764085 | 0.408382 | 0.507950 | 0.226168 | 0.443323 | 0.034941 |
| 875 | 2010 | MONTANA | 360341 | 989415 | 765792 | 0.773985 | 0.470547 | 0.497990 | 0.297747 | 0.504745 | 0.034199 |
| 863 | 2010 | INDIANA | 1747640 | 6483802 | 4874896 | 0.751858 | 0.358498 | 0.508082 | 0.245926 | 0.392370 | 0.033872 |
| 896 | 2010 | WASHINGTON | 2479409 | 6724540 | 5143903 | 0.764945 | 0.482009 | 0.501990 | 0.328674 | 0.513525 | 0.031516 |
| 855 | 2010 | COLORADO | 1763106 | 5029196 | 3803001 | 0.756185 | 0.463609 | 0.498895 | 0.381234 | 0.494222 | 0.030613 |
| 867 | 2010 | LOUISIANA | 1035948 | 4533372 | 3415094 | 0.753323 | 0.303344 | 0.510412 | 0.226974 | 0.324341 | 0.020997 |
| 899 | 2010 | WYOMING | 186682 | 563626 | 428209 | 0.759740 | 0.435960 | 0.490027 | 0.257575 | 0.455831 | 0.019871 |
| 872 | 2010 | MINNESOTA | 2089062 | 5303925 | 4019316 | 0.757800 | 0.519756 | 0.503768 | 0.338504 | 0.502215 | 0.017540 |
| 897 | 2010 | WEST VIRGINIA | 514373 | 1852994 | 1465505 | 0.790885 | 0.350987 | 0.507030 | 0.195699 | 0.334404 | 0.016583 |
| 883 | 2010 | NORTH DAKOTA | 236344 | 672591 | 522655 | 0.777077 | 0.452199 | 0.494625 | 0.280952 | 0.464119 | 0.011920 |
| 874 | 2010 | MISSOURI | 1919791 | 5988927 | 4563510 | 0.761991 | 0.420683 | 0.510262 | 0.273591 | 0.431710 | 0.011027 |
| 890 | 2010 | SOUTH DAKOTA | 319426 | 814180 | 611357 | 0.750887 | 0.522487 | 0.499694 | 0.270392 | 0.512252 | 0.010235 |
| 870 | 2010 | MASSACHUSETTS | 2219813 | 6547629 | 5129171 | 0.783363 | 0.432782 | 0.516478 | 0.405656 | 0.425645 | 0.007137 |
| 864 | 2010 | IOWA | 1094452 | 3046355 | 2318473 | 0.761065 | 0.472057 | 0.504930 | 0.273666 | 0.478054 | 0.005997 |
| 894 | 2010 | VERMONT | 238335 | 625741 | 496428 | 0.793344 | 0.480100 | 0.507553 | 0.359857 | 0.474959 | 0.005141 |
| 851 | 2010 | ALASKA | 252990 | 710231 | 522751 | 0.736030 | 0.483959 | 0.479531 | 0.281086 | 0.488368 | 0.004409 |
| 886 | 2010 | OREGON | 1427027 | 3831074 | 2965220 | 0.773992 | 0.481255 | 0.504942 | 0.310570 | 0.485345 | 0.004090 |
| 898 | 2010 | WISCONSIN | 2138775 | 5686986 | 4348246 | 0.764596 | 0.491871 | 0.503612 | 0.281679 | 0.493941 | 0.002071 |
Top 5 states for which the turnout estimate was least successful: sorting by the absolute difference between predicted and actual turnout (abs_diff), the states with the largest errors in 2010 are Utah, Hawaii, Georgia, North Carolina and Texas.
# Score the tuned model on the held-out 2012 election.
y_pred_2012 = best.predict(X_test_2012)

# MSE, MAE, RMSE and R squared on the 2012 test data (rounded to 3 decimals);
# RMSE is taken from the unrounded MSE.
mse_2012 = mean_squared_error(y_test_2012, y_pred_2012)
MSE = round(mse_2012, 3)
MAE = round(mean_absolute_error(y_test_2012, y_pred_2012), 3)
RMSE = round(np.sqrt(mse_2012), 3)
R2 = round(r2_score(y_test_2012, y_pred_2012), 3)

# one-row summary table
df2 = pd.DataFrame(
    [[MSE, MAE, RMSE, R2]],
    index=['AdaBoost Regressor'],
    columns=['MSE', 'MAE', 'RMSE', 'R2'],
)
df2
| MSE | MAE | RMSE | R2 | |
|---|---|---|---|---|
| AdaBoost Regressor | 0.004 | 0.051 | 0.063 | 0.181 |
# Rows of the 2012 election. An explicit copy is taken because this frame gets
# a new column later in the notebook; without it that write targets a slice of
# df (SettingWithCopy, currently hidden by the global warnings filter).
tmp2012 = df[df['year'] == 2012].copy()
tmp2012
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | |
|---|---|---|---|---|---|---|---|---|---|
| 900 | 2012 | ALABAMA | 1927122 | 4813946 | 3684787 | 0.765440 | 0.522994 | 0.514513 | 0.239858 |
| 901 | 2012 | ALASKA | 288840 | 725255 | 535397 | 0.738219 | 0.539488 | 0.479531 | 0.282498 |
| 902 | 2012 | ARIZONA | 2173259 | 6544211 | 4905918 | 0.749658 | 0.442987 | 0.502997 | 0.278148 |
| 903 | 2012 | ARKANSAS | 1038054 | 2949208 | 2234791 | 0.757760 | 0.464497 | 0.508982 | 0.215998 |
| 904 | 2012 | CALIFORNIA | 12204357 | 38019006 | 28680763 | 0.754380 | 0.425524 | 0.502785 | 0.312994 |
| 905 | 2012 | COLORADO | 2450488 | 5186330 | 3941163 | 0.759914 | 0.621768 | 0.498895 | 0.383150 |
| 906 | 2012 | CONNECTICUT | 1465487 | 3597705 | 2789596 | 0.775382 | 0.525340 | 0.513458 | 0.377697 |
| 907 | 2012 | DELAWARE | 388059 | 916868 | 709891 | 0.774256 | 0.546646 | 0.515663 | 0.301829 |
| 908 | 2012 | FLORIDA | 7512911 | 19341327 | 15270158 | 0.789509 | 0.492000 | 0.511339 | 0.276989 |
| 909 | 2012 | GEORGIA | 3552967 | 9911171 | 7394822 | 0.746110 | 0.480467 | 0.511759 | 0.292705 |
| 910 | 2012 | HAWAII | 422539 | 1392772 | 1084314 | 0.778529 | 0.389683 | 0.498815 | 0.311069 |
| 911 | 2012 | IDAHO | 634983 | 1594673 | 1163512 | 0.729624 | 0.545747 | 0.498864 | 0.260899 |
| 912 | 2012 | ILLINOIS | 5057772 | 12878494 | 9783173 | 0.759652 | 0.516987 | 0.509576 | 0.327627 |
| 913 | 2012 | INDIANA | 2553743 | 6535665 | 4929657 | 0.754270 | 0.518037 | 0.508082 | 0.247161 |
| 914 | 2012 | IOWA | 1535469 | 3074386 | 2343874 | 0.762388 | 0.655099 | 0.504930 | 0.275041 |
| 915 | 2012 | KANSAS | 1057739 | 2885316 | 2156209 | 0.747304 | 0.490555 | 0.503941 | 0.315184 |
| 916 | 2012 | KENTUCKY | 1737037 | 4383673 | 3357281 | 0.765860 | 0.517394 | 0.507950 | 0.227304 |
| 917 | 2012 | LOUISIANA | 1705617 | 4602681 | 3476883 | 0.755404 | 0.490559 | 0.510412 | 0.228115 |
| 918 | 2012 | MAINE | 693801 | 1328101 | 1058243 | 0.796809 | 0.655616 | 0.510782 | 0.291005 |
| 919 | 2012 | MARYLAND | 2579538 | 5891680 | 4523759 | 0.767822 | 0.570220 | 0.516285 | 0.380869 |
| 920 | 2012 | MASSACHUSETTS | 2879565 | 6659627 | 5239351 | 0.786733 | 0.549603 | 0.516478 | 0.407694 |
| 921 | 2012 | MICHIGAN | 4574615 | 9886610 | 7578532 | 0.766545 | 0.603628 | 0.509544 | 0.273696 |
| 922 | 2012 | MINNESOTA | 2807826 | 5377695 | 4085045 | 0.759627 | 0.687343 | 0.503768 | 0.340205 |
| 923 | 2012 | MISSISSIPPI | 1208175 | 2982963 | 2232845 | 0.748533 | 0.541092 | 0.514309 | 0.210580 |
| 924 | 2012 | MISSOURI | 2675885 | 6023267 | 4603905 | 0.764353 | 0.581221 | 0.510262 | 0.274966 |
| 925 | 2012 | MONTANA | 479740 | 1003522 | 778298 | 0.775566 | 0.616396 | 0.497990 | 0.299243 |
| 926 | 2012 | NEBRASKA | 772515 | 1854862 | 1389460 | 0.749091 | 0.555982 | 0.503600 | 0.300398 |
| 927 | 2012 | NEVADA | 973742 | 2752410 | 2084445 | 0.757316 | 0.467147 | 0.495283 | 0.229599 |
| 928 | 2012 | NEW HAMPSHIRE | 682018 | 1320923 | 1039460 | 0.786919 | 0.656127 | 0.506603 | 0.351863 |
| 929 | 2012 | NEW JERSEY | 3281778 | 8882095 | 6820431 | 0.767885 | 0.481169 | 0.513255 | 0.370281 |
| 930 | 2012 | NEW MEXICO | 765458 | 2083590 | 1566254 | 0.751709 | 0.488719 | 0.505851 | 0.260903 |
| 931 | 2012 | NEW YORK | 6456343 | 19625409 | 15297720 | 0.779485 | 0.422046 | 0.516116 | 0.341484 |
| 932 | 2012 | NORTH CAROLINA | 4379666 | 9755299 | 7449864 | 0.763674 | 0.587885 | 0.512802 | 0.291098 |
| 933 | 2012 | NORTH DAKOTA | 315716 | 701380 | 543720 | 0.775215 | 0.580659 | 0.494625 | 0.282364 |
| 934 | 2012 | OHIO | 5140157 | 11546969 | 8844420 | 0.765952 | 0.581175 | 0.511837 | 0.265989 |
| 935 | 2012 | OKLAHOMA | 1325935 | 3815298 | 2873141 | 0.753058 | 0.461493 | 0.504977 | 0.244793 |
| 936 | 2012 | OREGON | 1705571 | 3893920 | 3026055 | 0.777123 | 0.563629 | 0.504942 | 0.312131 |
| 937 | 2012 | PENNSYLVANIA | 5556330 | 12768034 | 9990642 | 0.782473 | 0.556153 | 0.512632 | 0.291465 |
| 938 | 2012 | RHODE ISLAND | 427321 | 1052761 | 832044 | 0.790345 | 0.513580 | 0.516701 | 0.320538 |
| 939 | 2012 | SOUTH CAROLINA | 1791578 | 4719009 | 3629956 | 0.769220 | 0.493554 | 0.513504 | 0.263311 |
| 940 | 2012 | SOUTH DAKOTA | 361429 | 832576 | 625545 | 0.751337 | 0.577783 | 0.499694 | 0.271751 |
| 941 | 2012 | TENNESSEE | 2283173 | 6450632 | 4943511 | 0.766361 | 0.461853 | 0.512562 | 0.254547 |
| 942 | 2012 | TEXAS | 7663983 | 26078327 | 19026859 | 0.729604 | 0.402798 | 0.504085 | 0.279061 |
| 943 | 2012 | UTAH | 998897 | 2854222 | 1964476 | 0.688270 | 0.508480 | 0.497722 | 0.316331 |
| 944 | 2012 | VERMONT | 289663 | 625606 | 498787 | 0.797286 | 0.580735 | 0.507553 | 0.361665 |
| 945 | 2012 | VIRGINIA | 3733561 | 8188656 | 6307975 | 0.770331 | 0.591879 | 0.509251 | 0.368113 |
| 946 | 2012 | WASHINGTON | 3006266 | 6890899 | 5288778 | 0.767502 | 0.568424 | 0.501990 | 0.330326 |
| 947 | 2012 | WEST VIRGINIA | 641354 | 1855360 | 1469645 | 0.792108 | 0.436401 | 0.507030 | 0.196683 |
| 948 | 2012 | WISCONSIN | 2862341 | 5721075 | 4390494 | 0.767425 | 0.651941 | 0.503612 | 0.283094 |
| 949 | 2012 | WYOMING | 241205 | 576608 | 438695 | 0.760820 | 0.549824 | 0.490027 | 0.258870 |
# Attach the 2012 predictions to the 2012 rows as a new column
tmp2012['predicted_turnout'] = y_pred_2012
# Narrow the comparison down to the four states of interest
state_list = ['CALIFORNIA', 'FLORIDA', 'SOUTH DAKOTA', 'WYOMING']
in_state_list = tmp2012['state'].isin(state_list)
tmp2012 = tmp2012.loc[in_state_list]
tmp2012
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | |
|---|---|---|---|---|---|---|---|---|---|---|
| 904 | 2012 | CALIFORNIA | 12204357 | 38019006 | 28680763 | 0.754380 | 0.425524 | 0.502785 | 0.312994 | 0.461005 |
| 908 | 2012 | FLORIDA | 7512911 | 19341327 | 15270158 | 0.789509 | 0.492000 | 0.511339 | 0.276989 | 0.431548 |
| 940 | 2012 | SOUTH DAKOTA | 361429 | 832576 | 625545 | 0.751337 | 0.577783 | 0.499694 | 0.271751 | 0.517877 |
| 949 | 2012 | WYOMING | 241205 | 576608 | 438695 | 0.760820 | 0.549824 | 0.490027 | 0.258870 | 0.473714 |
# Grouped bars: actual vs. predicted turnout for each selected state
turnout_ax = tmp2012.plot.bar(x="state", y=["voter_turnout", "predicted_turnout"], figsize=(9, 8))
turnout_ax.set_title("Predicted Vs Actual - 2012")
# render the figure
plt.show()
# Yellowbrick diagnostics for AdaBoost, evaluated on the 2012 test data.
# Both plots are built from a fresh regressor with the same settings.
ada_settings = dict(random_state=RSEED, learning_rate=1.0, n_estimators=300)
# residuals plot (quick method)
viz = residuals_plot(AdaBoostRegressor(**ada_settings), X_train, y_train, X_test_2012, y_test_2012)
# prediction-error plot: fit on the training data, score on 2012, then draw
visualizer = PredictionError(AdaBoostRegressor(**ada_settings))
visualizer.fit(X_train, y_train)
visualizer.score(X_test_2012, y_test_2012)
visualizer.show()
<AxesSubplot:title={'center':'Prediction Error for AdaBoostRegressor'}, xlabel='$y$', ylabel='$\\hat{y}$'>
# Rank the 2012 rows by absolute prediction error.
# .copy() makes states5 an independent frame, so the column assignments below
# do not write into a filtered slice of df (pandas' SettingWithCopy case,
# which the global warnings filter would otherwise hide).
states5 = df[df['year'] == 2012].copy()
states5['predicted_turnout'] = y_pred_2012
states5['abs_diff'] = (states5['voter_turnout'] - states5['predicted_turnout']).abs()
# largest errors first (display only; states5 itself is left unsorted)
states5.sort_values('abs_diff', ascending=False)
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | abs_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 914 | 2012 | IOWA | 1535469 | 3074386 | 2343874 | 0.762388 | 0.655099 | 0.504930 | 0.275041 | 0.521597 | 0.133502 |
| 923 | 2012 | MISSISSIPPI | 1208175 | 2982963 | 2232845 | 0.748533 | 0.541092 | 0.514309 | 0.210580 | 0.418019 | 0.123074 |
| 922 | 2012 | MINNESOTA | 2807826 | 5377695 | 4085045 | 0.759627 | 0.687343 | 0.503768 | 0.340205 | 0.565253 | 0.122089 |
| 928 | 2012 | NEW HAMPSHIRE | 682018 | 1320923 | 1039460 | 0.786919 | 0.656127 | 0.506603 | 0.351863 | 0.541593 | 0.114534 |
| 948 | 2012 | WISCONSIN | 2862341 | 5721075 | 4390494 | 0.767425 | 0.651941 | 0.503612 | 0.283094 | 0.539276 | 0.112665 |
| 918 | 2012 | MAINE | 693801 | 1328101 | 1058243 | 0.796809 | 0.655616 | 0.510782 | 0.291005 | 0.544712 | 0.110904 |
| 925 | 2012 | MONTANA | 479740 | 1003522 | 778298 | 0.775566 | 0.616396 | 0.497990 | 0.299243 | 0.519217 | 0.097180 |
| 910 | 2012 | HAWAII | 422539 | 1392772 | 1084314 | 0.778529 | 0.389683 | 0.498815 | 0.311069 | 0.481421 | 0.091738 |
| 921 | 2012 | MICHIGAN | 4574615 | 9886610 | 7578532 | 0.766545 | 0.603628 | 0.509544 | 0.273696 | 0.521005 | 0.082623 |
| 934 | 2012 | OHIO | 5140157 | 11546969 | 8844420 | 0.765952 | 0.581175 | 0.511837 | 0.265989 | 0.499037 | 0.082138 |
| 932 | 2012 | NORTH CAROLINA | 4379666 | 9755299 | 7449864 | 0.763674 | 0.587885 | 0.512802 | 0.291098 | 0.510996 | 0.076889 |
| 949 | 2012 | WYOMING | 241205 | 576608 | 438695 | 0.760820 | 0.549824 | 0.490027 | 0.258870 | 0.473714 | 0.076110 |
| 944 | 2012 | VERMONT | 289663 | 625606 | 498787 | 0.797286 | 0.580735 | 0.507553 | 0.361665 | 0.507004 | 0.073731 |
| 905 | 2012 | COLORADO | 2450488 | 5186330 | 3941163 | 0.759914 | 0.621768 | 0.498895 | 0.383150 | 0.551208 | 0.070560 |
| 900 | 2012 | ALABAMA | 1927122 | 4813946 | 3684787 | 0.765440 | 0.522994 | 0.514513 | 0.239858 | 0.455022 | 0.067972 |
| 924 | 2012 | MISSOURI | 2675885 | 6023267 | 4603905 | 0.764353 | 0.581221 | 0.510262 | 0.274966 | 0.513525 | 0.067695 |
| 933 | 2012 | NORTH DAKOTA | 315716 | 701380 | 543720 | 0.775215 | 0.580659 | 0.494625 | 0.282364 | 0.515246 | 0.065413 |
| 903 | 2012 | ARKANSAS | 1038054 | 2949208 | 2234791 | 0.757760 | 0.464497 | 0.508982 | 0.215998 | 0.400531 | 0.063966 |
| 937 | 2012 | PENNSYLVANIA | 5556330 | 12768034 | 9990642 | 0.782473 | 0.556153 | 0.512632 | 0.291465 | 0.494222 | 0.061931 |
| 936 | 2012 | OREGON | 1705571 | 3893920 | 3026055 | 0.777123 | 0.563629 | 0.504942 | 0.312131 | 0.502215 | 0.061413 |
| 945 | 2012 | VIRGINIA | 3733561 | 8188656 | 6307975 | 0.770331 | 0.591879 | 0.509251 | 0.368113 | 0.530747 | 0.061132 |
| 908 | 2012 | FLORIDA | 7512911 | 19341327 | 15270158 | 0.789509 | 0.492000 | 0.511339 | 0.276989 | 0.431548 | 0.060451 |
| 940 | 2012 | SOUTH DAKOTA | 361429 | 832576 | 625545 | 0.751337 | 0.577783 | 0.499694 | 0.271751 | 0.517877 | 0.059906 |
| 926 | 2012 | NEBRASKA | 772515 | 1854862 | 1389460 | 0.749091 | 0.555982 | 0.503600 | 0.300398 | 0.503844 | 0.052139 |
| 920 | 2012 | MASSACHUSETTS | 2879565 | 6659627 | 5239351 | 0.786733 | 0.549603 | 0.516478 | 0.407694 | 0.499040 | 0.050564 |
| 906 | 2012 | CONNECTICUT | 1465487 | 3597705 | 2789596 | 0.775382 | 0.525340 | 0.513458 | 0.377697 | 0.481418 | 0.043922 |
| 916 | 2012 | KENTUCKY | 1737037 | 4383673 | 3357281 | 0.765860 | 0.517394 | 0.507950 | 0.227304 | 0.474326 | 0.043068 |
| 907 | 2012 | DELAWARE | 388059 | 916868 | 709891 | 0.774256 | 0.546646 | 0.515663 | 0.301829 | 0.504111 | 0.042535 |
| 946 | 2012 | WASHINGTON | 3006266 | 6890899 | 5288778 | 0.767502 | 0.568424 | 0.501990 | 0.330326 | 0.526719 | 0.041704 |
| 904 | 2012 | CALIFORNIA | 12204357 | 38019006 | 28680763 | 0.754380 | 0.425524 | 0.502785 | 0.312994 | 0.461005 | 0.035481 |
| 929 | 2012 | NEW JERSEY | 3281778 | 8882095 | 6820431 | 0.767885 | 0.481169 | 0.513255 | 0.370281 | 0.516472 | 0.035303 |
| 919 | 2012 | MARYLAND | 2579538 | 5891680 | 4523759 | 0.767822 | 0.570220 | 0.516285 | 0.380869 | 0.539276 | 0.030944 |
| 947 | 2012 | WEST VIRGINIA | 641354 | 1855360 | 1469645 | 0.792108 | 0.436401 | 0.507030 | 0.196683 | 0.410446 | 0.025955 |
| 913 | 2012 | INDIANA | 2553743 | 6535665 | 4929657 | 0.754270 | 0.518037 | 0.508082 | 0.247161 | 0.492435 | 0.025602 |
| 938 | 2012 | RHODE ISLAND | 427321 | 1052761 | 832044 | 0.790345 | 0.513580 | 0.516701 | 0.320538 | 0.488089 | 0.025490 |
| 917 | 2012 | LOUISIANA | 1705617 | 4602681 | 3476883 | 0.755404 | 0.490559 | 0.510412 | 0.228115 | 0.469367 | 0.021193 |
| 939 | 2012 | SOUTH CAROLINA | 1791578 | 4719009 | 3629956 | 0.769220 | 0.493554 | 0.513504 | 0.263311 | 0.473006 | 0.020548 |
| 941 | 2012 | TENNESSEE | 2283173 | 6450632 | 4943511 | 0.766361 | 0.461853 | 0.512562 | 0.254547 | 0.443323 | 0.018529 |
| 909 | 2012 | GEORGIA | 3552967 | 9911171 | 7394822 | 0.746110 | 0.480467 | 0.511759 | 0.292705 | 0.497857 | 0.017390 |
| 912 | 2012 | ILLINOIS | 5057772 | 12878494 | 9783173 | 0.759652 | 0.516987 | 0.509576 | 0.327627 | 0.499725 | 0.017262 |
| 930 | 2012 | NEW MEXICO | 765458 | 2083590 | 1566254 | 0.751709 | 0.488719 | 0.505851 | 0.260903 | 0.504164 | 0.015445 |
| 927 | 2012 | NEVADA | 973742 | 2752410 | 2084445 | 0.757316 | 0.467147 | 0.495283 | 0.229599 | 0.452690 | 0.014457 |
| 931 | 2012 | NEW YORK | 6456343 | 19625409 | 15297720 | 0.779485 | 0.422046 | 0.516116 | 0.341484 | 0.408257 | 0.013789 |
| 942 | 2012 | TEXAS | 7663983 | 26078327 | 19026859 | 0.729604 | 0.402798 | 0.504085 | 0.279061 | 0.415900 | 0.013102 |
| 935 | 2012 | OKLAHOMA | 1325935 | 3815298 | 2873141 | 0.753058 | 0.461493 | 0.504977 | 0.244793 | 0.474176 | 0.012683 |
| 911 | 2012 | IDAHO | 634983 | 1594673 | 1163512 | 0.729624 | 0.545747 | 0.498864 | 0.260899 | 0.536471 | 0.009276 |
| 915 | 2012 | KANSAS | 1057739 | 2885316 | 2156209 | 0.747304 | 0.490555 | 0.503941 | 0.315184 | 0.495621 | 0.005066 |
| 943 | 2012 | UTAH | 998897 | 2854222 | 1964476 | 0.688270 | 0.508480 | 0.497722 | 0.316331 | 0.511585 | 0.003105 |
| 902 | 2012 | ARIZONA | 2173259 | 6544211 | 4905918 | 0.749658 | 0.442987 | 0.502997 | 0.278148 | 0.440662 | 0.002326 |
| 901 | 2012 | ALASKA | 288840 | 725255 | 535397 | 0.738219 | 0.539488 | 0.479531 | 0.282498 | 0.539865 | 0.000378 |
Top 5 states for which the turnout estimate was least successful: sorting by abs_diff, we can see that the states with the highest difference between predicted and actual values are: Iowa, Mississippi, Minnesota, New Hampshire and Wisconsin.
# get feature importance
# best_estimator_ only exists when GridSearchCV refits the winning model, so
# it is already fitted; read its importances directly instead of fitting the
# tuned model a second time (redundant work that also mutates it in place).
# NOTE(review): assumes grid_search.fit was called on this same X_train — confirm.
imp = pd.DataFrame(grid_search.best_estimator_.feature_importances_,
                   index=X_train.columns, columns=['Importance']).sort_values('Importance')
# keep the six most important features, highest first
imp = imp.nlargest(6, 'Importance')
imp
| Importance | |
|---|---|
| totalVotes | 0.386547 |
| population | 0.195172 |
| VAP | 0.193100 |
| bachelors_percent | 0.134423 |
| can_vote_percent | 0.070400 |
| female_percent | 0.020358 |
# horizontal bar chart of the selected feature importances
bar_positions = range(len(imp))
imp_fig, imp_ax = plt.subplots(figsize=(8, 6))
imp_ax.barh(bar_positions, imp['Importance'], color='#9ecae1')
# faint vertical guide lines only
imp_ax.grid(axis='x', alpha=0.5, color='lightgrey')
# label each bar with its feature name
imp_ax.set_yticks(bar_positions)
imp_ax.set_yticklabels(imp.index)
imp_ax.set_title('Feature Importance', fontsize=14)
plt.show()
# Decision tree regressor tuned with a grid search using cv=10
dt = DecisionTreeRegressor(random_state=RSEED)
# every tuned hyper-parameter is searched over the same candidate values
parameters_grid = {
    param: [2, 4, 6, 8, 10]
    for param in ('max_depth', 'min_samples_split', 'min_samples_leaf')
}
grid_search = GridSearchCV(estimator=dt, param_grid=parameters_grid, cv=10)
grid_search.fit(X_train, y_train)
# keep the winning estimator for the predictions that follow
best = grid_search.best_estimator_
# show the selected hyper-parameters as a one-row table
best_params_table = pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value'])
best_params_table.T
| max_depth | min_samples_leaf | min_samples_split | |
|---|---|---|---|
| Selected Value | 10 | 2 | 2 |
# predict on test data 2010
y_pred_2010 = best.predict(X_test_2010)
# calculate MSE, MAE, RMSE and R Squared for the 2010 test data
# (the unrounded MSE is computed once and reused for the RMSE)
mse_2010 = mean_squared_error(y_test_2010, y_pred_2010)
MSE = round(mse_2010, 3)
MAE = round(mean_absolute_error(y_test_2010, y_pred_2010), 3)
RMSE = round(np.sqrt(mse_2010), 3)
R2 = round(r2_score(y_test_2010, y_pred_2010), 3)
df2 = pd.DataFrame([MSE, MAE, RMSE, R2]).T
# `best` is the grid-searched DecisionTreeRegressor, so the row is labelled
# accordingly (it was previously mislabelled 'AdaBoost Regressor')
df2 = df2.rename(index={0: 'Decision Tree Regressor'}, columns={0: 'MSE', 1:'MAE', 2:'RMSE', 3:"R2"})
df2
| MSE | MAE | RMSE | R2 | |
|---|---|---|---|---|
| AdaBoost Regressor | 0.006 | 0.06 | 0.076 | -0.346 |
# 2010 rows as an independent copy: a prediction column is added to this
# frame later, which should not be done on a filtered slice of df
tmp2010 = df[df['year'] == 2010].copy()
tmp2010
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | |
|---|---|---|---|---|---|---|---|---|---|
| 850 | 2010 | ALABAMA | 1359759 | 4779736 | 3646981 | 0.763009 | 0.372845 | 0.514513 | 0.238659 |
| 851 | 2010 | ALASKA | 252990 | 710231 | 522751 | 0.736030 | 0.483959 | 0.479531 | 0.281086 |
| 852 | 2010 | ARIZONA | 1698135 | 6392017 | 4763130 | 0.745169 | 0.356517 | 0.502997 | 0.276757 |
| 853 | 2010 | ARKANSAS | 773866 | 2915918 | 2204313 | 0.755959 | 0.351069 | 0.508982 | 0.214918 |
| 854 | 2010 | CALIFORNIA | 9644560 | 37253956 | 27955194 | 0.750395 | 0.345001 | 0.502785 | 0.311429 |
| 855 | 2010 | COLORADO | 1763106 | 5029196 | 3803001 | 0.756185 | 0.463609 | 0.498895 | 0.381234 |
| 856 | 2010 | CONNECTICUT | 1138116 | 3574097 | 2756819 | 0.771333 | 0.412837 | 0.513458 | 0.375809 |
| 857 | 2010 | DELAWARE | 305636 | 897934 | 692372 | 0.771072 | 0.441433 | 0.515663 | 0.300320 |
| 858 | 2010 | FLORIDA | 5116018 | 18801310 | 14797963 | 0.787071 | 0.345724 | 0.511339 | 0.275604 |
| 859 | 2010 | GEORGIA | 2468489 | 9687653 | 7196177 | 0.742819 | 0.343028 | 0.511759 | 0.291242 |
| 860 | 2010 | HAWAII | 360121 | 1360301 | 1056446 | 0.776627 | 0.340880 | 0.498815 | 0.309514 |
| 861 | 2010 | IDAHO | 447144 | 1567582 | 1138528 | 0.726296 | 0.392739 | 0.498864 | 0.259595 |
| 862 | 2010 | ILLINOIS | 3696108 | 12830632 | 9702168 | 0.756172 | 0.380957 | 0.509576 | 0.325989 |
| 863 | 2010 | INDIANA | 1747640 | 6483802 | 4874896 | 0.751858 | 0.358498 | 0.508082 | 0.245926 |
| 864 | 2010 | IOWA | 1094452 | 3046355 | 2318473 | 0.761065 | 0.472057 | 0.504930 | 0.273666 |
| 865 | 2010 | KANSAS | 835529 | 2853118 | 2126260 | 0.745241 | 0.392957 | 0.503941 | 0.313608 |
| 866 | 2010 | KENTUCKY | 1354051 | 4339367 | 3315647 | 0.764085 | 0.408382 | 0.507950 | 0.226168 |
| 867 | 2010 | LOUISIANA | 1035948 | 4533372 | 3415094 | 0.753323 | 0.303344 | 0.510412 | 0.226974 |
| 868 | 2010 | MAINE | 564326 | 1328361 | 1053779 | 0.793293 | 0.535526 | 0.510782 | 0.289550 |
| 869 | 2010 | MARYLAND | 1823638 | 5773552 | 4419968 | 0.765554 | 0.412591 | 0.516285 | 0.378965 |
| 870 | 2010 | MASSACHUSETTS | 2219813 | 6547629 | 5129171 | 0.783363 | 0.432782 | 0.516478 | 0.405656 |
| 871 | 2010 | MICHIGAN | 3194857 | 9883640 | 7539072 | 0.762783 | 0.423773 | 0.509544 | 0.272328 |
| 872 | 2010 | MINNESOTA | 2089062 | 5303925 | 4019316 | 0.757800 | 0.519756 | 0.503768 | 0.338504 |
| 873 | 2010 | MISSISSIPPI | 788549 | 2967297 | 2211757 | 0.745378 | 0.356526 | 0.514309 | 0.209527 |
| 874 | 2010 | MISSOURI | 1919791 | 5988927 | 4563510 | 0.761991 | 0.420683 | 0.510262 | 0.273591 |
| 875 | 2010 | MONTANA | 360341 | 989415 | 765792 | 0.773985 | 0.470547 | 0.497990 | 0.297747 |
| 876 | 2010 | NEBRASKA | 465510 | 1826341 | 1366743 | 0.748350 | 0.340598 | 0.503600 | 0.298896 |
| 877 | 2010 | NEVADA | 702788 | 2700551 | 2036187 | 0.753989 | 0.345149 | 0.495283 | 0.228451 |
| 878 | 2010 | NEW HAMPSHIRE | 449787 | 1316470 | 1029354 | 0.781905 | 0.436960 | 0.506603 | 0.350103 |
| 879 | 2010 | NEW JERSEY | 2121584 | 8791894 | 6725870 | 0.765008 | 0.315436 | 0.513255 | 0.368430 |
| 880 | 2010 | NEW MEXICO | 596651 | 2059179 | 1540481 | 0.748104 | 0.387315 | 0.505851 | 0.259599 |
| 881 | 2010 | NEW YORK | 4484408 | 19378102 | 15052653 | 0.776787 | 0.297915 | 0.516116 | 0.339777 |
| 882 | 2010 | NORTH CAROLINA | 2662110 | 9535483 | 7253813 | 0.760718 | 0.366995 | 0.512802 | 0.289643 |
| 883 | 2010 | NORTH DAKOTA | 236344 | 672591 | 522655 | 0.777077 | 0.452199 | 0.494625 | 0.280952 |
| 884 | 2010 | OHIO | 3825014 | 11536504 | 8806302 | 0.763342 | 0.434350 | 0.511837 | 0.264659 |
| 885 | 2010 | OKLAHOMA | 792980 | 3751351 | 2821701 | 0.752183 | 0.281029 | 0.504977 | 0.243569 |
| 886 | 2010 | OREGON | 1427027 | 3831074 | 2965220 | 0.773992 | 0.481255 | 0.504942 | 0.310570 |
| 887 | 2010 | PENNSYLVANIA | 3956401 | 12702379 | 9909668 | 0.780143 | 0.399247 | 0.512632 | 0.290008 |
| 888 | 2010 | RHODE ISLAND | 335004 | 1052567 | 828361 | 0.786991 | 0.404418 | 0.516701 | 0.318936 |
| 889 | 2010 | SOUTH CAROLINA | 1339410 | 4625364 | 3545098 | 0.766447 | 0.377820 | 0.513504 | 0.261995 |
| 890 | 2010 | SOUTH DAKOTA | 319426 | 814180 | 611357 | 0.750887 | 0.522487 | 0.499694 | 0.270392 |
| 891 | 2010 | TENNESSEE | 1559120 | 6346105 | 4849154 | 0.764115 | 0.321524 | 0.512562 | 0.253275 |
| 892 | 2010 | TEXAS | 4744189 | 25145561 | 18283803 | 0.727119 | 0.259475 | 0.504085 | 0.277665 |
| 893 | 2010 | UTAH | 640495 | 2763885 | 1893012 | 0.684910 | 0.338347 | 0.497722 | 0.314749 |
| 894 | 2010 | VERMONT | 238335 | 625741 | 496428 | 0.793344 | 0.480100 | 0.507553 | 0.359857 |
| 895 | 2010 | VIRGINIA | 2184271 | 7994802 | 6142207 | 0.768275 | 0.355617 | 0.509251 | 0.366272 |
| 896 | 2010 | WASHINGTON | 2479409 | 6724540 | 5143903 | 0.764945 | 0.482009 | 0.501990 | 0.328674 |
| 897 | 2010 | WEST VIRGINIA | 514373 | 1852994 | 1465505 | 0.790885 | 0.350987 | 0.507030 | 0.195699 |
| 898 | 2010 | WISCONSIN | 2138775 | 5686986 | 4348246 | 0.764596 | 0.491871 | 0.503612 | 0.281679 |
| 899 | 2010 | WYOMING | 186682 | 563626 | 428209 | 0.759740 | 0.435960 | 0.490027 | 0.257575 |
# Attach the 2010 predictions to the 2010 rows as a new column
tmp2010['predicted_turnout'] = y_pred_2010
# Narrow the comparison down to the four states of interest
state_list = ['CALIFORNIA', 'FLORIDA', 'SOUTH DAKOTA', 'WYOMING']
in_state_list = tmp2010['state'].isin(state_list)
tmp2010 = tmp2010.loc[in_state_list]
tmp2010
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | |
|---|---|---|---|---|---|---|---|---|---|---|
| 854 | 2010 | CALIFORNIA | 9644560 | 37253956 | 27955194 | 0.750395 | 0.345001 | 0.502785 | 0.311429 | 0.396975 |
| 858 | 2010 | FLORIDA | 5116018 | 18801310 | 14797963 | 0.787071 | 0.345724 | 0.511339 | 0.275604 | 0.464994 |
| 890 | 2010 | SOUTH DAKOTA | 319426 | 814180 | 611357 | 0.750887 | 0.522487 | 0.499694 | 0.270392 | 0.570330 |
| 899 | 2010 | WYOMING | 186682 | 563626 | 428209 | 0.759740 | 0.435960 | 0.490027 | 0.257575 | 0.472054 |
# Grouped bars: actual vs. predicted turnout for each selected state
turnout_ax = tmp2010.plot.bar(x="state", y=["voter_turnout", "predicted_turnout"], figsize=(9, 8))
turnout_ax.set_title("Predicted Vs Actual - 2010")
# render the figure
plt.show()
# Create the visualizers, fit, score, and show them (2010 test data).
# random_state=RSEED is passed so these trees are seeded like the tuned
# DecisionTreeRegressor(random_state=RSEED); an unseeded tree can break ties
# between equally good splits differently on each run, which would make the
# plots non-reproducible.
tree_settings = dict(max_depth=10, min_samples_leaf=2, min_samples_split=2, random_state=RSEED)
viz = residuals_plot(DecisionTreeRegressor(**tree_settings), X_train, y_train, X_test_2010, y_test_2010)
visualizer = PredictionError(DecisionTreeRegressor(**tree_settings))
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test_2010, y_test_2010) # Evaluate the model on the test data
visualizer.show()
<AxesSubplot:title={'center':'Prediction Error for DecisionTreeRegressor'}, xlabel='$y$', ylabel='$\\hat{y}$'>
# Rank the 2010 rows by absolute prediction error.
# .copy() makes states5 an independent frame, so the column assignments below
# do not write into a filtered slice of df (pandas' SettingWithCopy case,
# which the global warnings filter would otherwise hide).
states5 = df[df['year'] == 2010].copy()
states5['predicted_turnout'] = y_pred_2010
states5['abs_diff'] = (states5['voter_turnout'] - states5['predicted_turnout']).abs()
# largest errors first (display only; states5 itself is left unsorted)
states5.sort_values('abs_diff', ascending=False)
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | abs_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 880 | 2010 | NEW MEXICO | 596651 | 2059179 | 1540481 | 0.748104 | 0.387315 | 0.505851 | 0.259599 | 0.570330 | 0.183015 |
| 861 | 2010 | IDAHO | 447144 | 1567582 | 1138528 | 0.726296 | 0.392739 | 0.498864 | 0.259595 | 0.570330 | 0.177591 |
| 863 | 2010 | INDIANA | 1747640 | 6483802 | 4874896 | 0.751858 | 0.358498 | 0.508082 | 0.245926 | 0.522915 | 0.164417 |
| 881 | 2010 | NEW YORK | 4484408 | 19378102 | 15052653 | 0.776787 | 0.297915 | 0.516116 | 0.339777 | 0.438902 | 0.140987 |
| 893 | 2010 | UTAH | 640495 | 2763885 | 1893012 | 0.684910 | 0.338347 | 0.497722 | 0.314749 | 0.461898 | 0.123551 |
| 892 | 2010 | TEXAS | 4744189 | 25145561 | 18283803 | 0.727119 | 0.259475 | 0.504085 | 0.277665 | 0.381432 | 0.121957 |
| 876 | 2010 | NEBRASKA | 465510 | 1826341 | 1366743 | 0.748350 | 0.340598 | 0.503600 | 0.298896 | 0.461898 | 0.121300 |
| 858 | 2010 | FLORIDA | 5116018 | 18801310 | 14797963 | 0.787071 | 0.345724 | 0.511339 | 0.275604 | 0.464994 | 0.119270 |
| 862 | 2010 | ILLINOIS | 3696108 | 12830632 | 9702168 | 0.756172 | 0.380957 | 0.509576 | 0.325989 | 0.486513 | 0.105556 |
| 887 | 2010 | PENNSYLVANIA | 3956401 | 12702379 | 9909668 | 0.780143 | 0.399247 | 0.512632 | 0.290008 | 0.500879 | 0.101632 |
| 864 | 2010 | IOWA | 1094452 | 3046355 | 2318473 | 0.761065 | 0.472057 | 0.504930 | 0.273666 | 0.570330 | 0.098273 |
| 886 | 2010 | OREGON | 1427027 | 3831074 | 2965220 | 0.773992 | 0.481255 | 0.504942 | 0.310570 | 0.568302 | 0.087047 |
| 870 | 2010 | MASSACHUSETTS | 2219813 | 6547629 | 5129171 | 0.783363 | 0.432782 | 0.516478 | 0.405656 | 0.352861 | 0.079921 |
| 856 | 2010 | CONNECTICUT | 1138116 | 3574097 | 2756819 | 0.771333 | 0.412837 | 0.513458 | 0.375809 | 0.491009 | 0.078173 |
| 898 | 2010 | WISCONSIN | 2138775 | 5686986 | 4348246 | 0.764596 | 0.491871 | 0.503612 | 0.281679 | 0.568302 | 0.076431 |
| 853 | 2010 | ARKANSAS | 773866 | 2915918 | 2204313 | 0.755959 | 0.351069 | 0.508982 | 0.214918 | 0.275694 | 0.075375 |
| 894 | 2010 | VERMONT | 238335 | 625741 | 496428 | 0.793344 | 0.480100 | 0.507553 | 0.359857 | 0.407307 | 0.072793 |
| 865 | 2010 | KANSAS | 835529 | 2853118 | 2126260 | 0.745241 | 0.392957 | 0.503941 | 0.313608 | 0.461898 | 0.068941 |
| 855 | 2010 | COLORADO | 1763106 | 5029196 | 3803001 | 0.756185 | 0.463609 | 0.498895 | 0.381234 | 0.531942 | 0.068333 |
| 889 | 2010 | SOUTH CAROLINA | 1339410 | 4625364 | 3545098 | 0.766447 | 0.377820 | 0.513504 | 0.261995 | 0.316754 | 0.061067 |
| 850 | 2010 | ALABAMA | 1359759 | 4779736 | 3646981 | 0.763009 | 0.372845 | 0.514513 | 0.238659 | 0.316754 | 0.056091 |
| 895 | 2010 | VIRGINIA | 2184271 | 7994802 | 6142207 | 0.768275 | 0.355617 | 0.509251 | 0.366272 | 0.410068 | 0.054452 |
| 878 | 2010 | NEW HAMPSHIRE | 449787 | 1316470 | 1029354 | 0.781905 | 0.436960 | 0.506603 | 0.350103 | 0.491009 | 0.054049 |
| 854 | 2010 | CALIFORNIA | 9644560 | 37253956 | 27955194 | 0.750395 | 0.345001 | 0.502785 | 0.311429 | 0.396975 | 0.051974 |
| 852 | 2010 | ARIZONA | 1698135 | 6392017 | 4763130 | 0.745169 | 0.356517 | 0.502997 | 0.276757 | 0.408211 | 0.051695 |
| 857 | 2010 | DELAWARE | 305636 | 897934 | 692372 | 0.771072 | 0.441433 | 0.515663 | 0.300320 | 0.491009 | 0.049576 |
| 890 | 2010 | SOUTH DAKOTA | 319426 | 814180 | 611357 | 0.750887 | 0.522487 | 0.499694 | 0.270392 | 0.570330 | 0.047843 |
| 868 | 2010 | MAINE | 564326 | 1328361 | 1053779 | 0.793293 | 0.535526 | 0.510782 | 0.289550 | 0.491009 | 0.044517 |
| 896 | 2010 | WASHINGTON | 2479409 | 6724540 | 5143903 | 0.764945 | 0.482009 | 0.501990 | 0.328674 | 0.442904 | 0.039105 |
| 877 | 2010 | NEVADA | 702788 | 2700551 | 2036187 | 0.753989 | 0.345149 | 0.495283 | 0.228451 | 0.383390 | 0.038241 |
| 897 | 2010 | WEST VIRGINIA | 514373 | 1852994 | 1465505 | 0.790885 | 0.350987 | 0.507030 | 0.195699 | 0.388781 | 0.037794 |
| 899 | 2010 | WYOMING | 186682 | 563626 | 428209 | 0.759740 | 0.435960 | 0.490027 | 0.257575 | 0.472054 | 0.036094 |
| 873 | 2010 | MISSISSIPPI | 788549 | 2967297 | 2211757 | 0.745378 | 0.356526 | 0.514309 | 0.209527 | 0.388818 | 0.032292 |
| 884 | 2010 | OHIO | 3825014 | 11536504 | 8806302 | 0.763342 | 0.434350 | 0.511837 | 0.264659 | 0.466305 | 0.031955 |
| 883 | 2010 | NORTH DAKOTA | 236344 | 672591 | 522655 | 0.777077 | 0.452199 | 0.494625 | 0.280952 | 0.421473 | 0.030726 |
| 859 | 2010 | GEORGIA | 2468489 | 9687653 | 7196177 | 0.742819 | 0.343028 | 0.511759 | 0.291242 | 0.371471 | 0.028443 |
| 869 | 2010 | MARYLAND | 1823638 | 5773552 | 4419968 | 0.765554 | 0.412591 | 0.516285 | 0.378965 | 0.440209 | 0.027618 |
| 885 | 2010 | OKLAHOMA | 792980 | 3751351 | 2821701 | 0.752183 | 0.281029 | 0.504977 | 0.243569 | 0.258150 | 0.022879 |
| 860 | 2010 | HAWAII | 360121 | 1360301 | 1056446 | 0.776627 | 0.340880 | 0.498815 | 0.309514 | 0.360492 | 0.019612 |
| 867 | 2010 | LOUISIANA | 1035948 | 4533372 | 3415094 | 0.753323 | 0.303344 | 0.510412 | 0.226974 | 0.322869 | 0.019525 |
| 888 | 2010 | RHODE ISLAND | 335004 | 1052567 | 828361 | 0.786991 | 0.404418 | 0.516701 | 0.318936 | 0.385046 | 0.019372 |
| 875 | 2010 | MONTANA | 360341 | 989415 | 765792 | 0.773985 | 0.470547 | 0.497990 | 0.297747 | 0.488262 | 0.017715 |
| 891 | 2010 | TENNESSEE | 1559120 | 6346105 | 4849154 | 0.764115 | 0.321524 | 0.512562 | 0.253275 | 0.304576 | 0.016948 |
| 879 | 2010 | NEW JERSEY | 2121584 | 8791894 | 6725870 | 0.765008 | 0.315436 | 0.513255 | 0.368430 | 0.331134 | 0.015698 |
| 866 | 2010 | KENTUCKY | 1354051 | 4339367 | 3315647 | 0.764085 | 0.408382 | 0.507950 | 0.226168 | 0.421573 | 0.013191 |
| 872 | 2010 | MINNESOTA | 2089062 | 5303925 | 4019316 | 0.757800 | 0.519756 | 0.503768 | 0.338504 | 0.531942 | 0.012187 |
| 874 | 2010 | MISSOURI | 1919791 | 5988927 | 4563510 | 0.761991 | 0.420683 | 0.510262 | 0.273591 | 0.425347 | 0.004664 |
| 882 | 2010 | NORTH CAROLINA | 2662110 | 9535483 | 7253813 | 0.760718 | 0.366995 | 0.512802 | 0.289643 | 0.371471 | 0.004477 |
| 851 | 2010 | ALASKA | 252990 | 710231 | 522751 | 0.736030 | 0.483959 | 0.479531 | 0.281086 | 0.479677 | 0.004282 |
| 871 | 2010 | MICHIGAN | 3194857 | 9883640 | 7539072 | 0.762783 | 0.423773 | 0.509544 | 0.272328 | 0.425347 | 0.001574 |
Top 5 states for which the turnout estimate was least successful: sorting by abs_diff, we can see that the states with the highest difference between predicted and actual values are: New Mexico, Idaho, Indiana, New York and Utah.
# predict on test data 2012
y_pred_2012 = best.predict(X_test_2012)
# calculate MSE, MAE, RMSE and R Squared for the 2012 test data
# (the unrounded MSE is computed once and reused for the RMSE)
mse_2012 = mean_squared_error(y_test_2012, y_pred_2012)
MSE = round(mse_2012, 3)
MAE = round(mean_absolute_error(y_test_2012, y_pred_2012), 3)
RMSE = round(np.sqrt(mse_2012), 3)
R2 = round(r2_score(y_test_2012, y_pred_2012), 3)
df2 = pd.DataFrame([MSE, MAE, RMSE, R2]).T
# `best` is the grid-searched DecisionTreeRegressor, so the row is labelled
# accordingly (it was previously mislabelled 'AdaBoost Regressor')
df2 = df2.rename(index={0: 'Decision Tree Regressor'}, columns={0: 'MSE', 1:'MAE', 2:'RMSE', 3:"R2"})
df2
| MSE | MAE | RMSE | R2 | |
|---|---|---|---|---|
| AdaBoost Regressor | 0.005 | 0.051 | 0.072 | -0.079 |
# 2012 rows as an independent copy: a prediction column is added to this
# frame later, which should not be done on a filtered slice of df
tmp2012 = df[df['year'] == 2012].copy()
tmp2012
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | |
|---|---|---|---|---|---|---|---|---|---|
| 900 | 2012 | ALABAMA | 1927122 | 4813946 | 3684787 | 0.765440 | 0.522994 | 0.514513 | 0.239858 |
| 901 | 2012 | ALASKA | 288840 | 725255 | 535397 | 0.738219 | 0.539488 | 0.479531 | 0.282498 |
| 902 | 2012 | ARIZONA | 2173259 | 6544211 | 4905918 | 0.749658 | 0.442987 | 0.502997 | 0.278148 |
| 903 | 2012 | ARKANSAS | 1038054 | 2949208 | 2234791 | 0.757760 | 0.464497 | 0.508982 | 0.215998 |
| 904 | 2012 | CALIFORNIA | 12204357 | 38019006 | 28680763 | 0.754380 | 0.425524 | 0.502785 | 0.312994 |
| 905 | 2012 | COLORADO | 2450488 | 5186330 | 3941163 | 0.759914 | 0.621768 | 0.498895 | 0.383150 |
| 906 | 2012 | CONNECTICUT | 1465487 | 3597705 | 2789596 | 0.775382 | 0.525340 | 0.513458 | 0.377697 |
| 907 | 2012 | DELAWARE | 388059 | 916868 | 709891 | 0.774256 | 0.546646 | 0.515663 | 0.301829 |
| 908 | 2012 | FLORIDA | 7512911 | 19341327 | 15270158 | 0.789509 | 0.492000 | 0.511339 | 0.276989 |
| 909 | 2012 | GEORGIA | 3552967 | 9911171 | 7394822 | 0.746110 | 0.480467 | 0.511759 | 0.292705 |
| 910 | 2012 | HAWAII | 422539 | 1392772 | 1084314 | 0.778529 | 0.389683 | 0.498815 | 0.311069 |
| 911 | 2012 | IDAHO | 634983 | 1594673 | 1163512 | 0.729624 | 0.545747 | 0.498864 | 0.260899 |
| 912 | 2012 | ILLINOIS | 5057772 | 12878494 | 9783173 | 0.759652 | 0.516987 | 0.509576 | 0.327627 |
| 913 | 2012 | INDIANA | 2553743 | 6535665 | 4929657 | 0.754270 | 0.518037 | 0.508082 | 0.247161 |
| 914 | 2012 | IOWA | 1535469 | 3074386 | 2343874 | 0.762388 | 0.655099 | 0.504930 | 0.275041 |
| 915 | 2012 | KANSAS | 1057739 | 2885316 | 2156209 | 0.747304 | 0.490555 | 0.503941 | 0.315184 |
| 916 | 2012 | KENTUCKY | 1737037 | 4383673 | 3357281 | 0.765860 | 0.517394 | 0.507950 | 0.227304 |
| 917 | 2012 | LOUISIANA | 1705617 | 4602681 | 3476883 | 0.755404 | 0.490559 | 0.510412 | 0.228115 |
| 918 | 2012 | MAINE | 693801 | 1328101 | 1058243 | 0.796809 | 0.655616 | 0.510782 | 0.291005 |
| 919 | 2012 | MARYLAND | 2579538 | 5891680 | 4523759 | 0.767822 | 0.570220 | 0.516285 | 0.380869 |
| 920 | 2012 | MASSACHUSETTS | 2879565 | 6659627 | 5239351 | 0.786733 | 0.549603 | 0.516478 | 0.407694 |
| 921 | 2012 | MICHIGAN | 4574615 | 9886610 | 7578532 | 0.766545 | 0.603628 | 0.509544 | 0.273696 |
| 922 | 2012 | MINNESOTA | 2807826 | 5377695 | 4085045 | 0.759627 | 0.687343 | 0.503768 | 0.340205 |
| 923 | 2012 | MISSISSIPPI | 1208175 | 2982963 | 2232845 | 0.748533 | 0.541092 | 0.514309 | 0.210580 |
| 924 | 2012 | MISSOURI | 2675885 | 6023267 | 4603905 | 0.764353 | 0.581221 | 0.510262 | 0.274966 |
| 925 | 2012 | MONTANA | 479740 | 1003522 | 778298 | 0.775566 | 0.616396 | 0.497990 | 0.299243 |
| 926 | 2012 | NEBRASKA | 772515 | 1854862 | 1389460 | 0.749091 | 0.555982 | 0.503600 | 0.300398 |
| 927 | 2012 | NEVADA | 973742 | 2752410 | 2084445 | 0.757316 | 0.467147 | 0.495283 | 0.229599 |
| 928 | 2012 | NEW HAMPSHIRE | 682018 | 1320923 | 1039460 | 0.786919 | 0.656127 | 0.506603 | 0.351863 |
| 929 | 2012 | NEW JERSEY | 3281778 | 8882095 | 6820431 | 0.767885 | 0.481169 | 0.513255 | 0.370281 |
| 930 | 2012 | NEW MEXICO | 765458 | 2083590 | 1566254 | 0.751709 | 0.488719 | 0.505851 | 0.260903 |
| 931 | 2012 | NEW YORK | 6456343 | 19625409 | 15297720 | 0.779485 | 0.422046 | 0.516116 | 0.341484 |
| 932 | 2012 | NORTH CAROLINA | 4379666 | 9755299 | 7449864 | 0.763674 | 0.587885 | 0.512802 | 0.291098 |
| 933 | 2012 | NORTH DAKOTA | 315716 | 701380 | 543720 | 0.775215 | 0.580659 | 0.494625 | 0.282364 |
| 934 | 2012 | OHIO | 5140157 | 11546969 | 8844420 | 0.765952 | 0.581175 | 0.511837 | 0.265989 |
| 935 | 2012 | OKLAHOMA | 1325935 | 3815298 | 2873141 | 0.753058 | 0.461493 | 0.504977 | 0.244793 |
| 936 | 2012 | OREGON | 1705571 | 3893920 | 3026055 | 0.777123 | 0.563629 | 0.504942 | 0.312131 |
| 937 | 2012 | PENNSYLVANIA | 5556330 | 12768034 | 9990642 | 0.782473 | 0.556153 | 0.512632 | 0.291465 |
| 938 | 2012 | RHODE ISLAND | 427321 | 1052761 | 832044 | 0.790345 | 0.513580 | 0.516701 | 0.320538 |
| 939 | 2012 | SOUTH CAROLINA | 1791578 | 4719009 | 3629956 | 0.769220 | 0.493554 | 0.513504 | 0.263311 |
| 940 | 2012 | SOUTH DAKOTA | 361429 | 832576 | 625545 | 0.751337 | 0.577783 | 0.499694 | 0.271751 |
| 941 | 2012 | TENNESSEE | 2283173 | 6450632 | 4943511 | 0.766361 | 0.461853 | 0.512562 | 0.254547 |
| 942 | 2012 | TEXAS | 7663983 | 26078327 | 19026859 | 0.729604 | 0.402798 | 0.504085 | 0.279061 |
| 943 | 2012 | UTAH | 998897 | 2854222 | 1964476 | 0.688270 | 0.508480 | 0.497722 | 0.316331 |
| 944 | 2012 | VERMONT | 289663 | 625606 | 498787 | 0.797286 | 0.580735 | 0.507553 | 0.361665 |
| 945 | 2012 | VIRGINIA | 3733561 | 8188656 | 6307975 | 0.770331 | 0.591879 | 0.509251 | 0.368113 |
| 946 | 2012 | WASHINGTON | 3006266 | 6890899 | 5288778 | 0.767502 | 0.568424 | 0.501990 | 0.330326 |
| 947 | 2012 | WEST VIRGINIA | 641354 | 1855360 | 1469645 | 0.792108 | 0.436401 | 0.507030 | 0.196683 |
| 948 | 2012 | WISCONSIN | 2862341 | 5721075 | 4390494 | 0.767425 | 0.651941 | 0.503612 | 0.283094 |
| 949 | 2012 | WYOMING | 241205 | 576608 | 438695 | 0.760820 | 0.549824 | 0.490027 | 0.258870 |
# Attach the 2012 predictions to the 2012 rows as a new column
tmp2012['predicted_turnout'] = y_pred_2012
# Narrow the comparison down to the four states of interest
state_list = ['CALIFORNIA', 'FLORIDA', 'SOUTH DAKOTA', 'WYOMING']
in_state_list = tmp2012['state'].isin(state_list)
tmp2012 = tmp2012.loc[in_state_list]
tmp2012
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | |
|---|---|---|---|---|---|---|---|---|---|---|
| 904 | 2012 | CALIFORNIA | 12204357 | 38019006 | 28680763 | 0.754380 | 0.425524 | 0.502785 | 0.312994 | 0.420785 |
| 908 | 2012 | FLORIDA | 7512911 | 19341327 | 15270158 | 0.789509 | 0.492000 | 0.511339 | 0.276989 | 0.354978 |
| 940 | 2012 | SOUTH DAKOTA | 361429 | 832576 | 625545 | 0.751337 | 0.577783 | 0.499694 | 0.271751 | 0.570330 |
| 949 | 2012 | WYOMING | 241205 | 576608 | 438695 | 0.760820 | 0.549824 | 0.490027 | 0.258870 | 0.536243 |
# Grouped bars: actual vs. predicted turnout for each selected state
turnout_ax = tmp2012.plot.bar(x="state", y=["voter_turnout", "predicted_turnout"], figsize=(9, 8))
turnout_ax.set_title("Predicted Vs Actual - 2012")
# render the figure
plt.show()
# Create the visualizers, fit, score, and show them (2012 test data).
# random_state=RSEED is passed so these trees are seeded like the tuned
# DecisionTreeRegressor(random_state=RSEED); an unseeded tree can break ties
# between equally good splits differently on each run, which would make the
# plots non-reproducible.
tree_settings = dict(max_depth=10, min_samples_leaf=2, min_samples_split=2, random_state=RSEED)
viz = residuals_plot(DecisionTreeRegressor(**tree_settings), X_train, y_train, X_test_2012, y_test_2012)
visualizer = PredictionError(DecisionTreeRegressor(**tree_settings))
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test_2012, y_test_2012) # Evaluate the model on the test data
visualizer.show()
<AxesSubplot:title={'center':'Prediction Error for DecisionTreeRegressor'}, xlabel='$y$', ylabel='$\\hat{y}$'>
# Rank the 2012 rows by absolute prediction error.
# .copy() makes states5 an independent frame, so the column assignments below
# do not write into a filtered slice of df (pandas' SettingWithCopy case,
# which the global warnings filter would otherwise hide).
states5 = df[df['year'] == 2012].copy()
states5['predicted_turnout'] = y_pred_2012
states5['abs_diff'] = (states5['voter_turnout'] - states5['predicted_turnout']).abs()
# largest errors first (display only; states5 itself is left unsorted)
states5.sort_values('abs_diff', ascending=False)
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | abs_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 914 | 2012 | IOWA | 1535469 | 3074386 | 2343874 | 0.762388 | 0.655099 | 0.504930 | 0.275041 | 0.472990 | 0.182108 |
| 921 | 2012 | MICHIGAN | 4574615 | 9886610 | 7578532 | 0.766545 | 0.603628 | 0.509544 | 0.273696 | 0.425347 | 0.178281 |
| 928 | 2012 | NEW HAMPSHIRE | 682018 | 1320923 | 1039460 | 0.786919 | 0.656127 | 0.506603 | 0.351863 | 0.491009 | 0.165118 |
| 918 | 2012 | MAINE | 693801 | 1328101 | 1058243 | 0.796809 | 0.655616 | 0.510782 | 0.291005 | 0.491009 | 0.164607 |
| 924 | 2012 | MISSOURI | 2675885 | 6023267 | 4603905 | 0.764353 | 0.581221 | 0.510262 | 0.274966 | 0.425347 | 0.155873 |
| 908 | 2012 | FLORIDA | 7512911 | 19341327 | 15270158 | 0.789509 | 0.492000 | 0.511339 | 0.276989 | 0.354978 | 0.137022 |
| 938 | 2012 | RHODE ISLAND | 427321 | 1052761 | 832044 | 0.790345 | 0.513580 | 0.516701 | 0.320538 | 0.385046 | 0.128534 |
| 934 | 2012 | OHIO | 5140157 | 11546969 | 8844420 | 0.765952 | 0.581175 | 0.511837 | 0.265989 | 0.466305 | 0.114870 |
| 926 | 2012 | NEBRASKA | 772515 | 1854862 | 1389460 | 0.749091 | 0.555982 | 0.503600 | 0.300398 | 0.461898 | 0.094084 |
| 902 | 2012 | ARIZONA | 2173259 | 6544211 | 4905918 | 0.749658 | 0.442987 | 0.502997 | 0.278148 | 0.534265 | 0.091277 |
| 944 | 2012 | VERMONT | 289663 | 625606 | 498787 | 0.797286 | 0.580735 | 0.507553 | 0.361665 | 0.491009 | 0.089726 |
| 930 | 2012 | NEW MEXICO | 765458 | 2083590 | 1566254 | 0.751709 | 0.488719 | 0.505851 | 0.260903 | 0.570330 | 0.081611 |
| 907 | 2012 | DELAWARE | 388059 | 916868 | 709891 | 0.774256 | 0.546646 | 0.515663 | 0.301829 | 0.491009 | 0.055637 |
| 903 | 2012 | ARKANSAS | 1038054 | 2949208 | 2234791 | 0.757760 | 0.464497 | 0.508982 | 0.215998 | 0.519751 | 0.055254 |
| 927 | 2012 | NEVADA | 973742 | 2752410 | 2084445 | 0.757316 | 0.467147 | 0.495283 | 0.229599 | 0.417022 | 0.050125 |
| 945 | 2012 | VIRGINIA | 3733561 | 8188656 | 6307975 | 0.770331 | 0.591879 | 0.509251 | 0.368113 | 0.544568 | 0.047312 |
| 943 | 2012 | UTAH | 998897 | 2854222 | 1964476 | 0.688270 | 0.508480 | 0.497722 | 0.316331 | 0.461898 | 0.046582 |
| 939 | 2012 | SOUTH CAROLINA | 1791578 | 4719009 | 3629956 | 0.769220 | 0.493554 | 0.513504 | 0.263311 | 0.538383 | 0.044829 |
| 916 | 2012 | KENTUCKY | 1737037 | 4383673 | 3357281 | 0.765860 | 0.517394 | 0.507950 | 0.227304 | 0.472990 | 0.044404 |
| 948 | 2012 | WISCONSIN | 2862341 | 5721075 | 4390494 | 0.767425 | 0.651941 | 0.503612 | 0.283094 | 0.607550 | 0.044390 |
| 906 | 2012 | CONNECTICUT | 1465487 | 3597705 | 2789596 | 0.775382 | 0.525340 | 0.513458 | 0.377697 | 0.568302 | 0.042961 |
| 946 | 2012 | WASHINGTON | 3006266 | 6890899 | 5288778 | 0.767502 | 0.568424 | 0.501990 | 0.330326 | 0.525977 | 0.042447 |
| 922 | 2012 | MINNESOTA | 2807826 | 5377695 | 4085045 | 0.759627 | 0.687343 | 0.503768 | 0.340205 | 0.728362 | 0.041019 |
| 905 | 2012 | COLORADO | 2450488 | 5186330 | 3941163 | 0.759914 | 0.621768 | 0.498895 | 0.383150 | 0.581123 | 0.040645 |
| 917 | 2012 | LOUISIANA | 1705617 | 4602681 | 3476883 | 0.755404 | 0.490559 | 0.510412 | 0.228115 | 0.457005 | 0.033554 |
| 901 | 2012 | ALASKA | 288840 | 725255 | 535397 | 0.738219 | 0.539488 | 0.479531 | 0.282498 | 0.570330 | 0.030842 |
| 910 | 2012 | HAWAII | 422539 | 1392772 | 1084314 | 0.778529 | 0.389683 | 0.498815 | 0.311069 | 0.360492 | 0.029192 |
| 933 | 2012 | NORTH DAKOTA | 315716 | 701380 | 543720 | 0.775215 | 0.580659 | 0.494625 | 0.282364 | 0.609450 | 0.028791 |
| 915 | 2012 | KANSAS | 1057739 | 2885316 | 2156209 | 0.747304 | 0.490555 | 0.503941 | 0.315184 | 0.461898 | 0.028657 |
| 937 | 2012 | PENNSYLVANIA | 5556330 | 12768034 | 9990642 | 0.782473 | 0.556153 | 0.512632 | 0.291465 | 0.581983 | 0.025830 |
| 911 | 2012 | IDAHO | 634983 | 1594673 | 1163512 | 0.729624 | 0.545747 | 0.498864 | 0.260899 | 0.570330 | 0.024583 |
| 920 | 2012 | MASSACHUSETTS | 2879565 | 6659627 | 5239351 | 0.786733 | 0.549603 | 0.516478 | 0.407694 | 0.525977 | 0.023627 |
| 913 | 2012 | INDIANA | 2553743 | 6535665 | 4929657 | 0.754270 | 0.518037 | 0.508082 | 0.247161 | 0.541292 | 0.023255 |
| 942 | 2012 | TEXAS | 7663983 | 26078327 | 19026859 | 0.729604 | 0.402798 | 0.504085 | 0.279061 | 0.381432 | 0.021366 |
| 932 | 2012 | NORTH CAROLINA | 4379666 | 9755299 | 7449864 | 0.763674 | 0.587885 | 0.512802 | 0.291098 | 0.606989 | 0.019104 |
| 931 | 2012 | NEW YORK | 6456343 | 19625409 | 15297720 | 0.779485 | 0.422046 | 0.516116 | 0.341484 | 0.438902 | 0.016856 |
| 900 | 2012 | ALABAMA | 1927122 | 4813946 | 3684787 | 0.765440 | 0.522994 | 0.514513 | 0.239858 | 0.538383 | 0.015389 |
| 949 | 2012 | WYOMING | 241205 | 576608 | 438695 | 0.760820 | 0.549824 | 0.490027 | 0.258870 | 0.536243 | 0.013581 |
| 925 | 2012 | MONTANA | 479740 | 1003522 | 778298 | 0.775566 | 0.616396 | 0.497990 | 0.299243 | 0.603423 | 0.012974 |
| 912 | 2012 | ILLINOIS | 5057772 | 12878494 | 9783173 | 0.759652 | 0.516987 | 0.509576 | 0.327627 | 0.529665 | 0.012678 |
| 929 | 2012 | NEW JERSEY | 3281778 | 8882095 | 6820431 | 0.767885 | 0.481169 | 0.513255 | 0.370281 | 0.493731 | 0.012562 |
| 947 | 2012 | WEST VIRGINIA | 641354 | 1855360 | 1469645 | 0.792108 | 0.436401 | 0.507030 | 0.196683 | 0.444141 | 0.007740 |
| 940 | 2012 | SOUTH DAKOTA | 361429 | 832576 | 625545 | 0.751337 | 0.577783 | 0.499694 | 0.271751 | 0.570330 | 0.007453 |
| 923 | 2012 | MISSISSIPPI | 1208175 | 2982963 | 2232845 | 0.748533 | 0.541092 | 0.514309 | 0.210580 | 0.547170 | 0.006078 |
| 909 | 2012 | GEORGIA | 3552967 | 9911171 | 7394822 | 0.746110 | 0.480467 | 0.511759 | 0.292705 | 0.486513 | 0.006046 |
| 919 | 2012 | MARYLAND | 2579538 | 5891680 | 4523759 | 0.767822 | 0.570220 | 0.516285 | 0.380869 | 0.575892 | 0.005672 |
| 904 | 2012 | CALIFORNIA | 12204357 | 38019006 | 28680763 | 0.754380 | 0.425524 | 0.502785 | 0.312994 | 0.420785 | 0.004739 |
| 936 | 2012 | OREGON | 1705571 | 3893920 | 3026055 | 0.777123 | 0.563629 | 0.504942 | 0.312131 | 0.568302 | 0.004673 |
| 941 | 2012 | TENNESSEE | 2283173 | 6450632 | 4943511 | 0.766361 | 0.461853 | 0.512562 | 0.254547 | 0.466305 | 0.004452 |
| 935 | 2012 | OKLAHOMA | 1325935 | 3815298 | 2873141 | 0.753058 | 0.461493 | 0.504977 | 0.244793 | 0.459828 | 0.001665 |
Top 5 states for which the turnout estimate was least successful: sorting by the absolute difference between predicted and actual turnout (abs_diff), the states with the largest errors are Iowa, Michigan, New Hampshire, Maine and Missouri.
# get feature importance
# best_estimator_ only exists once GridSearchCV has refit the winning model, so it
# is already fitted: read its importances directly instead of calling fit() again,
# which would retrain (and overwrite) the tuned model in place.
imp = pd.DataFrame(
    {'Importance': grid_search.best_estimator_.feature_importances_},
    index=X_train.columns,
)
# keep the six most important features, highest first (nlargest does the ordering)
imp = imp.nlargest(6, 'Importance')
imp
| Importance | |
|---|---|
| totalVotes | 0.363255 |
| bachelors_percent | 0.214220 |
| VAP | 0.191044 |
| population | 0.113330 |
| can_vote_percent | 0.095493 |
| female_percent | 0.022659 |
# Horizontal bar chart of the selected feature importances
plt.figure(figsize=(8, 6))
plt.barh(
    range(len(imp)),  # one bar per row of imp, in table order
    imp['Importance'],
    color='#9ecae1',
)
# label each bar with its feature name
plt.yticks(range(len(imp)), imp.index)
# faint vertical guide lines only
plt.grid(axis='x', alpha=0.5, color='lightgrey')
plt.title('Feature Importance', fontsize=14)
plt.show()
# Random forest regressor, seeded with the notebook-wide RSEED
rf = RandomForestRegressor(random_state=RSEED)
# candidate hyper-parameter values explored by the search
parameters_grid = dict(
    max_depth=[2, 3, 4, 5, 6],
    min_samples_split=[2, 4],
    min_samples_leaf=[2, 8],
    n_estimators=[20, 50, 100, 200, 500],
)
# 10-fold cross-validated grid search over the candidates, run on the training data
grid_search = GridSearchCV(rf, parameters_grid, cv=10)
grid_search.fit(X_train, y_train)
# keep the winning model for the predictions that follow
best = grid_search.best_estimator_
# show the selected hyper-parameters as a one-row table
pd.DataFrame.from_dict(
    grid_search.best_params_,
    orient='index',
    columns=['Selected Value'],
).T
| max_depth | min_samples_leaf | min_samples_split | n_estimators | |
|---|---|---|---|---|
| Selected Value | 6 | 2 | 2 | 500 |
# predict on test data 2010 with the tuned random forest
y_pred_2010 = best.predict(X_test_2010)
# calculate MSE, MAE, RMSE and R Squared for the 2010 test data
MSE = round(mean_squared_error(y_test_2010, y_pred_2010), 3)
MAE = round(mean_absolute_error(y_test_2010, y_pred_2010), 3)
# RMSE is taken from the unrounded MSE so rounding is applied only once
RMSE = round(np.sqrt(mean_squared_error(y_test_2010, y_pred_2010)), 3)
R2 = round(r2_score(y_test_2010, y_pred_2010), 3)
# one-row summary table; the row is labelled with the model actually evaluated
# (it previously carried a stale 'AdaBoost Regressor' label)
df2 = pd.DataFrame([MSE, MAE, RMSE, R2]).T
df2 = df2.rename(index={0: 'Random Forest Regressor'}, columns={0: 'MSE', 1: 'MAE', 2: 'RMSE', 3: "R2"})
df2
| MSE | MAE | RMSE | R2 | |
|---|---|---|---|---|
| AdaBoost Regressor | 0.004 | 0.051 | 0.064 | 0.042 |
# every row of df recorded for the year 2010
tmp2010 = df.loc[df['year'] == 2010]
tmp2010
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | |
|---|---|---|---|---|---|---|---|---|---|
| 850 | 2010 | ALABAMA | 1359759 | 4779736 | 3646981 | 0.763009 | 0.372845 | 0.514513 | 0.238659 |
| 851 | 2010 | ALASKA | 252990 | 710231 | 522751 | 0.736030 | 0.483959 | 0.479531 | 0.281086 |
| 852 | 2010 | ARIZONA | 1698135 | 6392017 | 4763130 | 0.745169 | 0.356517 | 0.502997 | 0.276757 |
| 853 | 2010 | ARKANSAS | 773866 | 2915918 | 2204313 | 0.755959 | 0.351069 | 0.508982 | 0.214918 |
| 854 | 2010 | CALIFORNIA | 9644560 | 37253956 | 27955194 | 0.750395 | 0.345001 | 0.502785 | 0.311429 |
| 855 | 2010 | COLORADO | 1763106 | 5029196 | 3803001 | 0.756185 | 0.463609 | 0.498895 | 0.381234 |
| 856 | 2010 | CONNECTICUT | 1138116 | 3574097 | 2756819 | 0.771333 | 0.412837 | 0.513458 | 0.375809 |
| 857 | 2010 | DELAWARE | 305636 | 897934 | 692372 | 0.771072 | 0.441433 | 0.515663 | 0.300320 |
| 858 | 2010 | FLORIDA | 5116018 | 18801310 | 14797963 | 0.787071 | 0.345724 | 0.511339 | 0.275604 |
| 859 | 2010 | GEORGIA | 2468489 | 9687653 | 7196177 | 0.742819 | 0.343028 | 0.511759 | 0.291242 |
| 860 | 2010 | HAWAII | 360121 | 1360301 | 1056446 | 0.776627 | 0.340880 | 0.498815 | 0.309514 |
| 861 | 2010 | IDAHO | 447144 | 1567582 | 1138528 | 0.726296 | 0.392739 | 0.498864 | 0.259595 |
| 862 | 2010 | ILLINOIS | 3696108 | 12830632 | 9702168 | 0.756172 | 0.380957 | 0.509576 | 0.325989 |
| 863 | 2010 | INDIANA | 1747640 | 6483802 | 4874896 | 0.751858 | 0.358498 | 0.508082 | 0.245926 |
| 864 | 2010 | IOWA | 1094452 | 3046355 | 2318473 | 0.761065 | 0.472057 | 0.504930 | 0.273666 |
| 865 | 2010 | KANSAS | 835529 | 2853118 | 2126260 | 0.745241 | 0.392957 | 0.503941 | 0.313608 |
| 866 | 2010 | KENTUCKY | 1354051 | 4339367 | 3315647 | 0.764085 | 0.408382 | 0.507950 | 0.226168 |
| 867 | 2010 | LOUISIANA | 1035948 | 4533372 | 3415094 | 0.753323 | 0.303344 | 0.510412 | 0.226974 |
| 868 | 2010 | MAINE | 564326 | 1328361 | 1053779 | 0.793293 | 0.535526 | 0.510782 | 0.289550 |
| 869 | 2010 | MARYLAND | 1823638 | 5773552 | 4419968 | 0.765554 | 0.412591 | 0.516285 | 0.378965 |
| 870 | 2010 | MASSACHUSETTS | 2219813 | 6547629 | 5129171 | 0.783363 | 0.432782 | 0.516478 | 0.405656 |
| 871 | 2010 | MICHIGAN | 3194857 | 9883640 | 7539072 | 0.762783 | 0.423773 | 0.509544 | 0.272328 |
| 872 | 2010 | MINNESOTA | 2089062 | 5303925 | 4019316 | 0.757800 | 0.519756 | 0.503768 | 0.338504 |
| 873 | 2010 | MISSISSIPPI | 788549 | 2967297 | 2211757 | 0.745378 | 0.356526 | 0.514309 | 0.209527 |
| 874 | 2010 | MISSOURI | 1919791 | 5988927 | 4563510 | 0.761991 | 0.420683 | 0.510262 | 0.273591 |
| 875 | 2010 | MONTANA | 360341 | 989415 | 765792 | 0.773985 | 0.470547 | 0.497990 | 0.297747 |
| 876 | 2010 | NEBRASKA | 465510 | 1826341 | 1366743 | 0.748350 | 0.340598 | 0.503600 | 0.298896 |
| 877 | 2010 | NEVADA | 702788 | 2700551 | 2036187 | 0.753989 | 0.345149 | 0.495283 | 0.228451 |
| 878 | 2010 | NEW HAMPSHIRE | 449787 | 1316470 | 1029354 | 0.781905 | 0.436960 | 0.506603 | 0.350103 |
| 879 | 2010 | NEW JERSEY | 2121584 | 8791894 | 6725870 | 0.765008 | 0.315436 | 0.513255 | 0.368430 |
| 880 | 2010 | NEW MEXICO | 596651 | 2059179 | 1540481 | 0.748104 | 0.387315 | 0.505851 | 0.259599 |
| 881 | 2010 | NEW YORK | 4484408 | 19378102 | 15052653 | 0.776787 | 0.297915 | 0.516116 | 0.339777 |
| 882 | 2010 | NORTH CAROLINA | 2662110 | 9535483 | 7253813 | 0.760718 | 0.366995 | 0.512802 | 0.289643 |
| 883 | 2010 | NORTH DAKOTA | 236344 | 672591 | 522655 | 0.777077 | 0.452199 | 0.494625 | 0.280952 |
| 884 | 2010 | OHIO | 3825014 | 11536504 | 8806302 | 0.763342 | 0.434350 | 0.511837 | 0.264659 |
| 885 | 2010 | OKLAHOMA | 792980 | 3751351 | 2821701 | 0.752183 | 0.281029 | 0.504977 | 0.243569 |
| 886 | 2010 | OREGON | 1427027 | 3831074 | 2965220 | 0.773992 | 0.481255 | 0.504942 | 0.310570 |
| 887 | 2010 | PENNSYLVANIA | 3956401 | 12702379 | 9909668 | 0.780143 | 0.399247 | 0.512632 | 0.290008 |
| 888 | 2010 | RHODE ISLAND | 335004 | 1052567 | 828361 | 0.786991 | 0.404418 | 0.516701 | 0.318936 |
| 889 | 2010 | SOUTH CAROLINA | 1339410 | 4625364 | 3545098 | 0.766447 | 0.377820 | 0.513504 | 0.261995 |
| 890 | 2010 | SOUTH DAKOTA | 319426 | 814180 | 611357 | 0.750887 | 0.522487 | 0.499694 | 0.270392 |
| 891 | 2010 | TENNESSEE | 1559120 | 6346105 | 4849154 | 0.764115 | 0.321524 | 0.512562 | 0.253275 |
| 892 | 2010 | TEXAS | 4744189 | 25145561 | 18283803 | 0.727119 | 0.259475 | 0.504085 | 0.277665 |
| 893 | 2010 | UTAH | 640495 | 2763885 | 1893012 | 0.684910 | 0.338347 | 0.497722 | 0.314749 |
| 894 | 2010 | VERMONT | 238335 | 625741 | 496428 | 0.793344 | 0.480100 | 0.507553 | 0.359857 |
| 895 | 2010 | VIRGINIA | 2184271 | 7994802 | 6142207 | 0.768275 | 0.355617 | 0.509251 | 0.366272 |
| 896 | 2010 | WASHINGTON | 2479409 | 6724540 | 5143903 | 0.764945 | 0.482009 | 0.501990 | 0.328674 |
| 897 | 2010 | WEST VIRGINIA | 514373 | 1852994 | 1465505 | 0.790885 | 0.350987 | 0.507030 | 0.195699 |
| 898 | 2010 | WISCONSIN | 2138775 | 5686986 | 4348246 | 0.764596 | 0.491871 | 0.503612 | 0.281679 |
| 899 | 2010 | WYOMING | 186682 | 563626 | 428209 | 0.759740 | 0.435960 | 0.490027 | 0.257575 |
# Attach the model's 2010 predictions and keep only the four states of interest.
# assign() returns a new frame, so the prediction column is never written into a
# filtered slice of df (the chained-assignment pattern pandas warns about).
tmp2010 = tmp2010.assign(predicted_turnout=y_pred_2010)
state_list = ['CALIFORNIA','FLORIDA','SOUTH DAKOTA','WYOMING']
tmp2010 = tmp2010[tmp2010['state'].isin(state_list)]
tmp2010
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | |
|---|---|---|---|---|---|---|---|---|---|---|
| 854 | 2010 | CALIFORNIA | 9644560 | 37253956 | 27955194 | 0.750395 | 0.345001 | 0.502785 | 0.311429 | 0.404177 |
| 858 | 2010 | FLORIDA | 5116018 | 18801310 | 14797963 | 0.787071 | 0.345724 | 0.511339 | 0.275604 | 0.379712 |
| 890 | 2010 | SOUTH DAKOTA | 319426 | 814180 | 611357 | 0.750887 | 0.522487 | 0.499694 | 0.270392 | 0.545748 |
| 899 | 2010 | WYOMING | 186682 | 563626 | 428209 | 0.759740 | 0.435960 | 0.490027 | 0.257575 | 0.481046 |
# Grouped bar chart: actual vs. predicted turnout for each selected state
tmp2010.plot.bar(
    x="state",
    y=["voter_turnout", "predicted_turnout"],
    figsize=(9, 8),
)
# title the chart and render it
plt.title("Predicted Vs Actual - 2010")
plt.show()
# Create the visualizers, fit, score, and show them.
# Each plot gets a fresh, unfitted forest built from the tuned model's own
# parameters (best.get_params() carries the hyper-parameters and random_state),
# so the diagnostics are reproducible and describe the same configuration as
# `best` instead of an unseeded forest with hand-copied settings.
viz = residuals_plot(RandomForestRegressor(**best.get_params()), X_train, y_train, X_test_2010, y_test_2010)
visualizer = PredictionError(RandomForestRegressor(**best.get_params()))
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test_2010, y_test_2010) # Evaluate the model on the test data
visualizer.show()
<AxesSubplot:title={'center':'Prediction Error for RandomForestRegressor'}, xlabel='$y$', ylabel='$\\hat{y}$'>
# Rank the 2010 rows by the absolute gap between actual and predicted turnout.
# .copy() makes states5 an independent frame, so the column assignments below do
# not write into a filtered slice of df (the chained-assignment pattern pandas
# reports as SettingWithCopyWarning, which this notebook silences globally).
states5 = df[df['year'] == 2010].copy()
states5['predicted_turnout'] = y_pred_2010
states5['abs_diff'] = abs(states5['voter_turnout'] - states5['predicted_turnout'])
# worst predictions first
states5.sort_values('abs_diff', ascending=False)
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | abs_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 859 | 2010 | GEORGIA | 2468489 | 9687653 | 7196177 | 0.742819 | 0.343028 | 0.511759 | 0.291242 | 0.483028 | 0.140000 |
| 893 | 2010 | UTAH | 640495 | 2763885 | 1893012 | 0.684910 | 0.338347 | 0.497722 | 0.314749 | 0.470892 | 0.132545 |
| 882 | 2010 | NORTH CAROLINA | 2662110 | 9535483 | 7253813 | 0.760718 | 0.366995 | 0.512802 | 0.289643 | 0.482794 | 0.115799 |
| 892 | 2010 | TEXAS | 4744189 | 25145561 | 18283803 | 0.727119 | 0.259475 | 0.504085 | 0.277665 | 0.372356 | 0.112881 |
| 861 | 2010 | IDAHO | 447144 | 1567582 | 1138528 | 0.726296 | 0.392739 | 0.498864 | 0.259595 | 0.504935 | 0.112196 |
| 887 | 2010 | PENNSYLVANIA | 3956401 | 12702379 | 9909668 | 0.780143 | 0.399247 | 0.512632 | 0.290008 | 0.507959 | 0.108713 |
| 876 | 2010 | NEBRASKA | 465510 | 1826341 | 1366743 | 0.748350 | 0.340598 | 0.503600 | 0.298896 | 0.446753 | 0.106155 |
| 879 | 2010 | NEW JERSEY | 2121584 | 8791894 | 6725870 | 0.765008 | 0.315436 | 0.513255 | 0.368430 | 0.412008 | 0.096572 |
| 862 | 2010 | ILLINOIS | 3696108 | 12830632 | 9702168 | 0.756172 | 0.380957 | 0.509576 | 0.325989 | 0.474378 | 0.093422 |
| 865 | 2010 | KANSAS | 835529 | 2853118 | 2126260 | 0.745241 | 0.392957 | 0.503941 | 0.313608 | 0.484283 | 0.091326 |
| 852 | 2010 | ARIZONA | 1698135 | 6392017 | 4763130 | 0.745169 | 0.356517 | 0.502997 | 0.276757 | 0.445434 | 0.088918 |
| 885 | 2010 | OKLAHOMA | 792980 | 3751351 | 2821701 | 0.752183 | 0.281029 | 0.504977 | 0.243569 | 0.366868 | 0.085839 |
| 860 | 2010 | HAWAII | 360121 | 1360301 | 1056446 | 0.776627 | 0.340880 | 0.498815 | 0.309514 | 0.425117 | 0.084237 |
| 863 | 2010 | INDIANA | 1747640 | 6483802 | 4874896 | 0.751858 | 0.358498 | 0.508082 | 0.245926 | 0.440736 | 0.082238 |
| 880 | 2010 | NEW MEXICO | 596651 | 2059179 | 1540481 | 0.748104 | 0.387315 | 0.505851 | 0.259599 | 0.467032 | 0.079717 |
| 891 | 2010 | TENNESSEE | 1559120 | 6346105 | 4849154 | 0.764115 | 0.321524 | 0.512562 | 0.253275 | 0.389486 | 0.067962 |
| 895 | 2010 | VIRGINIA | 2184271 | 7994802 | 6142207 | 0.768275 | 0.355617 | 0.509251 | 0.366272 | 0.419253 | 0.063636 |
| 881 | 2010 | NEW YORK | 4484408 | 19378102 | 15052653 | 0.776787 | 0.297915 | 0.516116 | 0.339777 | 0.359899 | 0.061984 |
| 854 | 2010 | CALIFORNIA | 9644560 | 37253956 | 27955194 | 0.750395 | 0.345001 | 0.502785 | 0.311429 | 0.404177 | 0.059176 |
| 867 | 2010 | LOUISIANA | 1035948 | 4533372 | 3415094 | 0.753323 | 0.303344 | 0.510412 | 0.226974 | 0.362490 | 0.059146 |
| 886 | 2010 | OREGON | 1427027 | 3831074 | 2965220 | 0.773992 | 0.481255 | 0.504942 | 0.310570 | 0.432250 | 0.049005 |
| 868 | 2010 | MAINE | 564326 | 1328361 | 1053779 | 0.793293 | 0.535526 | 0.510782 | 0.289550 | 0.486672 | 0.048854 |
| 877 | 2010 | NEVADA | 702788 | 2700551 | 2036187 | 0.753989 | 0.345149 | 0.495283 | 0.228451 | 0.392932 | 0.047783 |
| 884 | 2010 | OHIO | 3825014 | 11536504 | 8806302 | 0.763342 | 0.434350 | 0.511837 | 0.264659 | 0.481518 | 0.047168 |
| 873 | 2010 | MISSISSIPPI | 788549 | 2967297 | 2211757 | 0.745378 | 0.356526 | 0.514309 | 0.209527 | 0.401887 | 0.045361 |
| 899 | 2010 | WYOMING | 186682 | 563626 | 428209 | 0.759740 | 0.435960 | 0.490027 | 0.257575 | 0.481046 | 0.045086 |
| 871 | 2010 | MICHIGAN | 3194857 | 9883640 | 7539072 | 0.762783 | 0.423773 | 0.509544 | 0.272328 | 0.464954 | 0.041181 |
| 857 | 2010 | DELAWARE | 305636 | 897934 | 692372 | 0.771072 | 0.441433 | 0.515663 | 0.300320 | 0.476642 | 0.035208 |
| 858 | 2010 | FLORIDA | 5116018 | 18801310 | 14797963 | 0.787071 | 0.345724 | 0.511339 | 0.275604 | 0.379712 | 0.033988 |
| 856 | 2010 | CONNECTICUT | 1138116 | 3574097 | 2756819 | 0.771333 | 0.412837 | 0.513458 | 0.375809 | 0.444067 | 0.031230 |
| 851 | 2010 | ALASKA | 252990 | 710231 | 522751 | 0.736030 | 0.483959 | 0.479531 | 0.281086 | 0.512266 | 0.028307 |
| 896 | 2010 | WASHINGTON | 2479409 | 6724540 | 5143903 | 0.764945 | 0.482009 | 0.501990 | 0.328674 | 0.507433 | 0.025424 |
| 888 | 2010 | RHODE ISLAND | 335004 | 1052567 | 828361 | 0.786991 | 0.404418 | 0.516701 | 0.318936 | 0.428571 | 0.024153 |
| 878 | 2010 | NEW HAMPSHIRE | 449787 | 1316470 | 1029354 | 0.781905 | 0.436960 | 0.506603 | 0.350103 | 0.460800 | 0.023840 |
| 890 | 2010 | SOUTH DAKOTA | 319426 | 814180 | 611357 | 0.750887 | 0.522487 | 0.499694 | 0.270392 | 0.545748 | 0.023261 |
| 894 | 2010 | VERMONT | 238335 | 625741 | 496428 | 0.793344 | 0.480100 | 0.507553 | 0.359857 | 0.457057 | 0.023043 |
| 889 | 2010 | SOUTH CAROLINA | 1339410 | 4625364 | 3545098 | 0.766447 | 0.377820 | 0.513504 | 0.261995 | 0.398621 | 0.020800 |
| 853 | 2010 | ARKANSAS | 773866 | 2915918 | 2204313 | 0.755959 | 0.351069 | 0.508982 | 0.214918 | 0.369899 | 0.018830 |
| 874 | 2010 | MISSOURI | 1919791 | 5988927 | 4563510 | 0.761991 | 0.420683 | 0.510262 | 0.273591 | 0.438878 | 0.018195 |
| 869 | 2010 | MARYLAND | 1823638 | 5773552 | 4419968 | 0.765554 | 0.412591 | 0.516285 | 0.378965 | 0.425392 | 0.012801 |
| 870 | 2010 | MASSACHUSETTS | 2219813 | 6547629 | 5129171 | 0.783363 | 0.432782 | 0.516478 | 0.405656 | 0.423760 | 0.009022 |
| 872 | 2010 | MINNESOTA | 2089062 | 5303925 | 4019316 | 0.757800 | 0.519756 | 0.503768 | 0.338504 | 0.512692 | 0.007064 |
| 898 | 2010 | WISCONSIN | 2138775 | 5686986 | 4348246 | 0.764596 | 0.491871 | 0.503612 | 0.281679 | 0.498431 | 0.006560 |
| 864 | 2010 | IOWA | 1094452 | 3046355 | 2318473 | 0.761065 | 0.472057 | 0.504930 | 0.273666 | 0.466436 | 0.005621 |
| 875 | 2010 | MONTANA | 360341 | 989415 | 765792 | 0.773985 | 0.470547 | 0.497990 | 0.297747 | 0.476159 | 0.005612 |
| 855 | 2010 | COLORADO | 1763106 | 5029196 | 3803001 | 0.756185 | 0.463609 | 0.498895 | 0.381234 | 0.468682 | 0.005073 |
| 897 | 2010 | WEST VIRGINIA | 514373 | 1852994 | 1465505 | 0.790885 | 0.350987 | 0.507030 | 0.195699 | 0.353526 | 0.002539 |
| 866 | 2010 | KENTUCKY | 1354051 | 4339367 | 3315647 | 0.764085 | 0.408382 | 0.507950 | 0.226168 | 0.410750 | 0.002368 |
| 883 | 2010 | NORTH DAKOTA | 236344 | 672591 | 522655 | 0.777077 | 0.452199 | 0.494625 | 0.280952 | 0.454541 | 0.002342 |
| 850 | 2010 | ALABAMA | 1359759 | 4779736 | 3646981 | 0.763009 | 0.372845 | 0.514513 | 0.238659 | 0.374811 | 0.001966 |
Top 5 states for which the turnout estimate was least successful: sorting by the absolute difference between predicted and actual turnout (abs_diff), the states with the largest errors are Georgia, Utah, North Carolina, Texas and Idaho.
# predict on test data 2012 with the tuned random forest
y_pred_2012 = best.predict(X_test_2012)
# calculate MSE, MAE, RMSE and R Squared for the 2012 test data
MSE = round(mean_squared_error(y_test_2012, y_pred_2012), 3)
MAE = round(mean_absolute_error(y_test_2012, y_pred_2012), 3)
# RMSE is taken from the unrounded MSE so rounding is applied only once
RMSE = round(np.sqrt(mean_squared_error(y_test_2012, y_pred_2012)), 3)
R2 = round(r2_score(y_test_2012, y_pred_2012), 3)
# one-row summary table; the row is labelled with the model actually evaluated
# (it previously carried a stale 'AdaBoost Regressor' label)
df2 = pd.DataFrame([MSE, MAE, RMSE, R2]).T
df2 = df2.rename(index={0: 'Random Forest Regressor'}, columns={0: 'MSE', 1: 'MAE', 2: 'RMSE', 3: "R2"})
df2
| MSE | MAE | RMSE | R2 | |
|---|---|---|---|---|
| AdaBoost Regressor | 0.004 | 0.051 | 0.063 | 0.187 |
# every row of df recorded for the year 2012
tmp2012 = df.loc[df['year'] == 2012]
tmp2012
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | |
|---|---|---|---|---|---|---|---|---|---|
| 900 | 2012 | ALABAMA | 1927122 | 4813946 | 3684787 | 0.765440 | 0.522994 | 0.514513 | 0.239858 |
| 901 | 2012 | ALASKA | 288840 | 725255 | 535397 | 0.738219 | 0.539488 | 0.479531 | 0.282498 |
| 902 | 2012 | ARIZONA | 2173259 | 6544211 | 4905918 | 0.749658 | 0.442987 | 0.502997 | 0.278148 |
| 903 | 2012 | ARKANSAS | 1038054 | 2949208 | 2234791 | 0.757760 | 0.464497 | 0.508982 | 0.215998 |
| 904 | 2012 | CALIFORNIA | 12204357 | 38019006 | 28680763 | 0.754380 | 0.425524 | 0.502785 | 0.312994 |
| 905 | 2012 | COLORADO | 2450488 | 5186330 | 3941163 | 0.759914 | 0.621768 | 0.498895 | 0.383150 |
| 906 | 2012 | CONNECTICUT | 1465487 | 3597705 | 2789596 | 0.775382 | 0.525340 | 0.513458 | 0.377697 |
| 907 | 2012 | DELAWARE | 388059 | 916868 | 709891 | 0.774256 | 0.546646 | 0.515663 | 0.301829 |
| 908 | 2012 | FLORIDA | 7512911 | 19341327 | 15270158 | 0.789509 | 0.492000 | 0.511339 | 0.276989 |
| 909 | 2012 | GEORGIA | 3552967 | 9911171 | 7394822 | 0.746110 | 0.480467 | 0.511759 | 0.292705 |
| 910 | 2012 | HAWAII | 422539 | 1392772 | 1084314 | 0.778529 | 0.389683 | 0.498815 | 0.311069 |
| 911 | 2012 | IDAHO | 634983 | 1594673 | 1163512 | 0.729624 | 0.545747 | 0.498864 | 0.260899 |
| 912 | 2012 | ILLINOIS | 5057772 | 12878494 | 9783173 | 0.759652 | 0.516987 | 0.509576 | 0.327627 |
| 913 | 2012 | INDIANA | 2553743 | 6535665 | 4929657 | 0.754270 | 0.518037 | 0.508082 | 0.247161 |
| 914 | 2012 | IOWA | 1535469 | 3074386 | 2343874 | 0.762388 | 0.655099 | 0.504930 | 0.275041 |
| 915 | 2012 | KANSAS | 1057739 | 2885316 | 2156209 | 0.747304 | 0.490555 | 0.503941 | 0.315184 |
| 916 | 2012 | KENTUCKY | 1737037 | 4383673 | 3357281 | 0.765860 | 0.517394 | 0.507950 | 0.227304 |
| 917 | 2012 | LOUISIANA | 1705617 | 4602681 | 3476883 | 0.755404 | 0.490559 | 0.510412 | 0.228115 |
| 918 | 2012 | MAINE | 693801 | 1328101 | 1058243 | 0.796809 | 0.655616 | 0.510782 | 0.291005 |
| 919 | 2012 | MARYLAND | 2579538 | 5891680 | 4523759 | 0.767822 | 0.570220 | 0.516285 | 0.380869 |
| 920 | 2012 | MASSACHUSETTS | 2879565 | 6659627 | 5239351 | 0.786733 | 0.549603 | 0.516478 | 0.407694 |
| 921 | 2012 | MICHIGAN | 4574615 | 9886610 | 7578532 | 0.766545 | 0.603628 | 0.509544 | 0.273696 |
| 922 | 2012 | MINNESOTA | 2807826 | 5377695 | 4085045 | 0.759627 | 0.687343 | 0.503768 | 0.340205 |
| 923 | 2012 | MISSISSIPPI | 1208175 | 2982963 | 2232845 | 0.748533 | 0.541092 | 0.514309 | 0.210580 |
| 924 | 2012 | MISSOURI | 2675885 | 6023267 | 4603905 | 0.764353 | 0.581221 | 0.510262 | 0.274966 |
| 925 | 2012 | MONTANA | 479740 | 1003522 | 778298 | 0.775566 | 0.616396 | 0.497990 | 0.299243 |
| 926 | 2012 | NEBRASKA | 772515 | 1854862 | 1389460 | 0.749091 | 0.555982 | 0.503600 | 0.300398 |
| 927 | 2012 | NEVADA | 973742 | 2752410 | 2084445 | 0.757316 | 0.467147 | 0.495283 | 0.229599 |
| 928 | 2012 | NEW HAMPSHIRE | 682018 | 1320923 | 1039460 | 0.786919 | 0.656127 | 0.506603 | 0.351863 |
| 929 | 2012 | NEW JERSEY | 3281778 | 8882095 | 6820431 | 0.767885 | 0.481169 | 0.513255 | 0.370281 |
| 930 | 2012 | NEW MEXICO | 765458 | 2083590 | 1566254 | 0.751709 | 0.488719 | 0.505851 | 0.260903 |
| 931 | 2012 | NEW YORK | 6456343 | 19625409 | 15297720 | 0.779485 | 0.422046 | 0.516116 | 0.341484 |
| 932 | 2012 | NORTH CAROLINA | 4379666 | 9755299 | 7449864 | 0.763674 | 0.587885 | 0.512802 | 0.291098 |
| 933 | 2012 | NORTH DAKOTA | 315716 | 701380 | 543720 | 0.775215 | 0.580659 | 0.494625 | 0.282364 |
| 934 | 2012 | OHIO | 5140157 | 11546969 | 8844420 | 0.765952 | 0.581175 | 0.511837 | 0.265989 |
| 935 | 2012 | OKLAHOMA | 1325935 | 3815298 | 2873141 | 0.753058 | 0.461493 | 0.504977 | 0.244793 |
| 936 | 2012 | OREGON | 1705571 | 3893920 | 3026055 | 0.777123 | 0.563629 | 0.504942 | 0.312131 |
| 937 | 2012 | PENNSYLVANIA | 5556330 | 12768034 | 9990642 | 0.782473 | 0.556153 | 0.512632 | 0.291465 |
| 938 | 2012 | RHODE ISLAND | 427321 | 1052761 | 832044 | 0.790345 | 0.513580 | 0.516701 | 0.320538 |
| 939 | 2012 | SOUTH CAROLINA | 1791578 | 4719009 | 3629956 | 0.769220 | 0.493554 | 0.513504 | 0.263311 |
| 940 | 2012 | SOUTH DAKOTA | 361429 | 832576 | 625545 | 0.751337 | 0.577783 | 0.499694 | 0.271751 |
| 941 | 2012 | TENNESSEE | 2283173 | 6450632 | 4943511 | 0.766361 | 0.461853 | 0.512562 | 0.254547 |
| 942 | 2012 | TEXAS | 7663983 | 26078327 | 19026859 | 0.729604 | 0.402798 | 0.504085 | 0.279061 |
| 943 | 2012 | UTAH | 998897 | 2854222 | 1964476 | 0.688270 | 0.508480 | 0.497722 | 0.316331 |
| 944 | 2012 | VERMONT | 289663 | 625606 | 498787 | 0.797286 | 0.580735 | 0.507553 | 0.361665 |
| 945 | 2012 | VIRGINIA | 3733561 | 8188656 | 6307975 | 0.770331 | 0.591879 | 0.509251 | 0.368113 |
| 946 | 2012 | WASHINGTON | 3006266 | 6890899 | 5288778 | 0.767502 | 0.568424 | 0.501990 | 0.330326 |
| 947 | 2012 | WEST VIRGINIA | 641354 | 1855360 | 1469645 | 0.792108 | 0.436401 | 0.507030 | 0.196683 |
| 948 | 2012 | WISCONSIN | 2862341 | 5721075 | 4390494 | 0.767425 | 0.651941 | 0.503612 | 0.283094 |
| 949 | 2012 | WYOMING | 241205 | 576608 | 438695 | 0.760820 | 0.549824 | 0.490027 | 0.258870 |
# Attach the model's 2012 predictions and keep only the four states of interest.
# assign() returns a new frame, so the prediction column is never written into a
# filtered slice of df (the chained-assignment pattern pandas warns about).
tmp2012 = tmp2012.assign(predicted_turnout=y_pred_2012)
state_list = ['CALIFORNIA','FLORIDA','SOUTH DAKOTA','WYOMING']
tmp2012 = tmp2012[tmp2012['state'].isin(state_list)]
tmp2012
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | |
|---|---|---|---|---|---|---|---|---|---|---|
| 904 | 2012 | CALIFORNIA | 12204357 | 38019006 | 28680763 | 0.754380 | 0.425524 | 0.502785 | 0.312994 | 0.432106 |
| 908 | 2012 | FLORIDA | 7512911 | 19341327 | 15270158 | 0.789509 | 0.492000 | 0.511339 | 0.276989 | 0.418899 |
| 940 | 2012 | SOUTH DAKOTA | 361429 | 832576 | 625545 | 0.751337 | 0.577783 | 0.499694 | 0.271751 | 0.548594 |
| 949 | 2012 | WYOMING | 241205 | 576608 | 438695 | 0.760820 | 0.549824 | 0.490027 | 0.258870 | 0.514703 |
# Grouped bar chart: actual vs. predicted turnout for each selected state
tmp2012.plot.bar(
    x="state",
    y=["voter_turnout", "predicted_turnout"],
    figsize=(9, 8),
)
# title the chart and render it
plt.title("Predicted Vs Actual - 2012")
plt.show()
# Create the visualizers, fit, score, and show them.
# Each plot gets a fresh, unfitted forest built from the tuned model's own
# parameters (best.get_params() carries the hyper-parameters and random_state),
# so the diagnostics are reproducible and describe the same configuration as
# `best` instead of an unseeded forest with hand-copied settings.
viz = residuals_plot(RandomForestRegressor(**best.get_params()), X_train, y_train, X_test_2012, y_test_2012)
visualizer = PredictionError(RandomForestRegressor(**best.get_params()))
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test_2012, y_test_2012) # Evaluate the model on the test data
visualizer.show()
<AxesSubplot:title={'center':'Prediction Error for RandomForestRegressor'}, xlabel='$y$', ylabel='$\\hat{y}$'>
# Rank the 2012 rows by the absolute gap between actual and predicted turnout.
# .copy() makes states5 an independent frame, so the column assignments below do
# not write into a filtered slice of df (the chained-assignment pattern pandas
# reports as SettingWithCopyWarning, which this notebook silences globally).
states5 = df[df['year'] == 2012].copy()
states5['predicted_turnout'] = y_pred_2012
states5['abs_diff'] = abs(states5['voter_turnout'] - states5['predicted_turnout'])
# worst predictions first
states5.sort_values('abs_diff', ascending=False)
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | predicted_turnout | abs_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 914 | 2012 | IOWA | 1535469 | 3074386 | 2343874 | 0.762388 | 0.655099 | 0.504930 | 0.275041 | 0.494823 | 0.160276 |
| 918 | 2012 | MAINE | 693801 | 1328101 | 1058243 | 0.796809 | 0.655616 | 0.510782 | 0.291005 | 0.529999 | 0.125617 |
| 928 | 2012 | NEW HAMPSHIRE | 682018 | 1320923 | 1039460 | 0.786919 | 0.656127 | 0.506603 | 0.351863 | 0.531110 | 0.125017 |
| 936 | 2012 | OREGON | 1705571 | 3893920 | 3026055 | 0.777123 | 0.563629 | 0.504942 | 0.312131 | 0.444402 | 0.119227 |
| 925 | 2012 | MONTANA | 479740 | 1003522 | 778298 | 0.775566 | 0.616396 | 0.497990 | 0.299243 | 0.502426 | 0.113971 |
| 944 | 2012 | VERMONT | 289663 | 625606 | 498787 | 0.797286 | 0.580735 | 0.507553 | 0.361665 | 0.483473 | 0.097262 |
| 921 | 2012 | MICHIGAN | 4574615 | 9886610 | 7578532 | 0.766545 | 0.603628 | 0.509544 | 0.273696 | 0.511758 | 0.091870 |
| 933 | 2012 | NORTH DAKOTA | 315716 | 701380 | 543720 | 0.775215 | 0.580659 | 0.494625 | 0.282364 | 0.489721 | 0.090939 |
| 922 | 2012 | MINNESOTA | 2807826 | 5377695 | 4085045 | 0.759627 | 0.687343 | 0.503768 | 0.340205 | 0.606497 | 0.080846 |
| 908 | 2012 | FLORIDA | 7512911 | 19341327 | 15270158 | 0.789509 | 0.492000 | 0.511339 | 0.276989 | 0.418899 | 0.073101 |
| 906 | 2012 | CONNECTICUT | 1465487 | 3597705 | 2789596 | 0.775382 | 0.525340 | 0.513458 | 0.377697 | 0.454089 | 0.071251 |
| 924 | 2012 | MISSOURI | 2675885 | 6023267 | 4603905 | 0.764353 | 0.581221 | 0.510262 | 0.274966 | 0.510825 | 0.070396 |
| 934 | 2012 | OHIO | 5140157 | 11546969 | 8844420 | 0.765952 | 0.581175 | 0.511837 | 0.265989 | 0.514217 | 0.066958 |
| 926 | 2012 | NEBRASKA | 772515 | 1854862 | 1389460 | 0.749091 | 0.555982 | 0.503600 | 0.300398 | 0.489356 | 0.066626 |
| 945 | 2012 | VIRGINIA | 3733561 | 8188656 | 6307975 | 0.770331 | 0.591879 | 0.509251 | 0.368113 | 0.525842 | 0.066037 |
| 916 | 2012 | KENTUCKY | 1737037 | 4383673 | 3357281 | 0.765860 | 0.517394 | 0.507950 | 0.227304 | 0.451613 | 0.065781 |
| 907 | 2012 | DELAWARE | 388059 | 916868 | 709891 | 0.774256 | 0.546646 | 0.515663 | 0.301829 | 0.481796 | 0.064850 |
| 902 | 2012 | ARIZONA | 2173259 | 6544211 | 4905918 | 0.749658 | 0.442987 | 0.502997 | 0.278148 | 0.501762 | 0.058775 |
| 948 | 2012 | WISCONSIN | 2862341 | 5721075 | 4390494 | 0.767425 | 0.651941 | 0.503612 | 0.283094 | 0.593358 | 0.058582 |
| 939 | 2012 | SOUTH CAROLINA | 1791578 | 4719009 | 3629956 | 0.769220 | 0.493554 | 0.513504 | 0.263311 | 0.437852 | 0.055702 |
| 910 | 2012 | HAWAII | 422539 | 1392772 | 1084314 | 0.778529 | 0.389683 | 0.498815 | 0.311069 | 0.443559 | 0.053875 |
| 932 | 2012 | NORTH CAROLINA | 4379666 | 9755299 | 7449864 | 0.763674 | 0.587885 | 0.512802 | 0.291098 | 0.536439 | 0.051447 |
| 900 | 2012 | ALABAMA | 1927122 | 4813946 | 3684787 | 0.765440 | 0.522994 | 0.514513 | 0.239858 | 0.471780 | 0.051214 |
| 923 | 2012 | MISSISSIPPI | 1208175 | 2982963 | 2232845 | 0.748533 | 0.541092 | 0.514309 | 0.210580 | 0.492603 | 0.048489 |
| 946 | 2012 | WASHINGTON | 3006266 | 6890899 | 5288778 | 0.767502 | 0.568424 | 0.501990 | 0.330326 | 0.520975 | 0.047449 |
| 938 | 2012 | RHODE ISLAND | 427321 | 1052761 | 832044 | 0.790345 | 0.513580 | 0.516701 | 0.320538 | 0.466920 | 0.046660 |
| 917 | 2012 | LOUISIANA | 1705617 | 4602681 | 3476883 | 0.755404 | 0.490559 | 0.510412 | 0.228115 | 0.444284 | 0.046276 |
| 920 | 2012 | MASSACHUSETTS | 2879565 | 6659627 | 5239351 | 0.786733 | 0.549603 | 0.516478 | 0.407694 | 0.507523 | 0.042080 |
| 905 | 2012 | COLORADO | 2450488 | 5186330 | 3941163 | 0.759914 | 0.621768 | 0.498895 | 0.383150 | 0.580642 | 0.041126 |
| 947 | 2012 | WEST VIRGINIA | 641354 | 1855360 | 1469645 | 0.792108 | 0.436401 | 0.507030 | 0.196683 | 0.396584 | 0.039817 |
| 949 | 2012 | WYOMING | 241205 | 576608 | 438695 | 0.760820 | 0.549824 | 0.490027 | 0.258870 | 0.514703 | 0.035121 |
| 927 | 2012 | NEVADA | 973742 | 2752410 | 2084445 | 0.757316 | 0.467147 | 0.495283 | 0.229599 | 0.434209 | 0.032938 |
| 903 | 2012 | ARKANSAS | 1038054 | 2949208 | 2234791 | 0.757760 | 0.464497 | 0.508982 | 0.215998 | 0.432620 | 0.031878 |
| 935 | 2012 | OKLAHOMA | 1325935 | 3815298 | 2873141 | 0.753058 | 0.461493 | 0.504977 | 0.244793 | 0.430857 | 0.030636 |
| 919 | 2012 | MARYLAND | 2579538 | 5891680 | 4523759 | 0.767822 | 0.570220 | 0.516285 | 0.380869 | 0.540889 | 0.029331 |
| 937 | 2012 | PENNSYLVANIA | 5556330 | 12768034 | 9990642 | 0.782473 | 0.556153 | 0.512632 | 0.291465 | 0.526903 | 0.029250 |
| 940 | 2012 | SOUTH DAKOTA | 361429 | 832576 | 625545 | 0.751337 | 0.577783 | 0.499694 | 0.271751 | 0.548594 | 0.029189 |
| 909 | 2012 | GEORGIA | 3552967 | 9911171 | 7394822 | 0.746110 | 0.480467 | 0.511759 | 0.292705 | 0.509338 | 0.028871 |
| 911 | 2012 | IDAHO | 634983 | 1594673 | 1163512 | 0.729624 | 0.545747 | 0.498864 | 0.260899 | 0.520293 | 0.025454 |
| 913 | 2012 | INDIANA | 2553743 | 6535665 | 4929657 | 0.754270 | 0.518037 | 0.508082 | 0.247161 | 0.504288 | 0.013749 |
| 901 | 2012 | ALASKA | 288840 | 725255 | 535397 | 0.738219 | 0.539488 | 0.479531 | 0.282498 | 0.552385 | 0.012898 |
| 942 | 2012 | TEXAS | 7663983 | 26078327 | 19026859 | 0.729604 | 0.402798 | 0.504085 | 0.279061 | 0.391651 | 0.011147 |
| 929 | 2012 | NEW JERSEY | 3281778 | 8882095 | 6820431 | 0.767885 | 0.481169 | 0.513255 | 0.370281 | 0.488678 | 0.007509 |
| 904 | 2012 | CALIFORNIA | 12204357 | 38019006 | 28680763 | 0.754380 | 0.425524 | 0.502785 | 0.312994 | 0.432106 | 0.006581 |
| 941 | 2012 | TENNESSEE | 2283173 | 6450632 | 4943511 | 0.766361 | 0.461853 | 0.512562 | 0.254547 | 0.468248 | 0.006395 |
| 943 | 2012 | UTAH | 998897 | 2854222 | 1964476 | 0.688270 | 0.508480 | 0.497722 | 0.316331 | 0.502158 | 0.006322 |
| 912 | 2012 | ILLINOIS | 5057772 | 12878494 | 9783173 | 0.759652 | 0.516987 | 0.509576 | 0.327627 | 0.511092 | 0.005895 |
| 915 | 2012 | KANSAS | 1057739 | 2885316 | 2156209 | 0.747304 | 0.490555 | 0.503941 | 0.315184 | 0.485753 | 0.004802 |
| 930 | 2012 | NEW MEXICO | 765458 | 2083590 | 1566254 | 0.751709 | 0.488719 | 0.505851 | 0.260903 | 0.485573 | 0.003146 |
| 931 | 2012 | NEW YORK | 6456343 | 19625409 | 15297720 | 0.779485 | 0.422046 | 0.516116 | 0.341484 | 0.421450 | 0.000596 |
Top 5 states for which the turnout estimate was least successful: looking at the absolute difference between the predicted and actual values (abs_diff), the states with the largest errors are:
# feature importance: refit the tuned estimator on the training data and rank its features
tuned_model = grid_search.best_estimator_.fit(X_train, y_train)
imp = pd.DataFrame(
    tuned_model.feature_importances_,
    index=X_train.columns,
    columns=['Importance'],
)
# keep the six most important features, largest first
imp = imp.sort_values('Importance').nlargest(6, 'Importance')
imp
| Importance | |
|---|---|
| totalVotes | 0.366492 |
| VAP | 0.182014 |
| bachelors_percent | 0.172571 |
| population | 0.131256 |
| can_vote_percent | 0.121287 |
| female_percent | 0.026379 |
# horizontal bar chart of the ranked feature importances
bar_positions = range(len(imp))
plt.figure(figsize=(8, 6))
plt.barh(bar_positions, imp['Importance'], color='#9ecae1')
# label every bar with its feature name
plt.yticks(bar_positions, imp.index)
plt.grid(axis='x', alpha=0.5, color='lightgrey')
plt.title('Feature Importance', fontsize=14)
plt.show()
| Model | MSE | MAE | R2 |
|---|---|---|---|
| AdaBoostRegressor | 0.005 | 0.058 | -0.194 |
| DecisionTreeRegressor | 0.006 | 0.06 | -0.346 |
| RandomForestRegressor | 0.004 | 0.051 | 0.042 |
The results we got for the three models are very similar to each other in terms of MSE and MAE, but in terms of overall performance, looking at R2, we can see that the RandomForestRegressor is much better, so it is preferred over the other two.
| Model | MSE | MAE | R2 |
|---|---|---|---|
| AdaBoostRegressor | 0.004 | 0.051 | 0.181 |
| DecisionTreeRegressor | 0.005 | 0.051 | -0.079 |
| RandomForestRegressor | 0.004 | 0.051 | 0.187 |
We notice that the DecisionTreeRegressor performed the worst, while the other two performed very similarly to each other and are therefore much more reliable to use. Again, since R2 is better for the RandomForestRegressor, it is still the winner over the AdaBoostRegressor.
# preview the house_elections table (displayed as the cell output)
house_elections
| year | state | office | district | candidate | party | candidatevotes | totalvotes | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | US HOUSE | 1 | BILL DAVENPORT | DEMOCRAT | 58906 | 157170 |
| 1 | 1976 | ALABAMA | US HOUSE | 1 | JACK EDWARDS | REPUBLICAN | 98257 | 157170 |
| 3 | 1976 | ALABAMA | US HOUSE | 2 | J CAROLE KEAHEY | DEMOCRAT | 66288 | 156362 |
| 4 | 1976 | ALABAMA | US HOUSE | 2 | WILLIAM L "BILL" DICKINSON | REPUBLICAN | 90069 | 156362 |
| 6 | 1976 | ALABAMA | US HOUSE | 3 | BILL NICHOLS | DEMOCRAT | 106935 | 108048 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 31094 | 2020 | WISCONSIN | US HOUSE | 8 | MIKE GALLAGHER | REPUBLICAN | 268173 | 417838 |
| 31096 | 2020 | WYOMING | US HOUSE | 0 | JEFF HAGGIT | CONSTITUTION | 7905 | 278503 |
| 31097 | 2020 | WYOMING | US HOUSE | 0 | LIZ CHENEY | REPUBLICAN | 185732 | 278503 |
| 31098 | 2020 | WYOMING | US HOUSE | 0 | LYNNETTE GREY BULL | DEMOCRAT | 66576 | 278503 |
| 31100 | 2020 | WYOMING | US HOUSE | 0 | RICHARD BRUBAKER | LIBERTARIAN | 10154 | 278503 |
27483 rows × 8 columns
# total candidate votes per party within each (year, state)
party_keys = ['year', 'state', 'party']
grouped_data = (
    house_elections
    .groupby(party_keys, as_index=False)['candidatevotes']
    .sum()
)
grouped_data
| year | state | party | candidatevotes | |
|---|---|---|---|---|
| 0 | 1976 | ALABAMA | DEMOCRAT | 667052 |
| 1 | 1976 | ALABAMA | NATIONAL DEMOCRAT | 1021 |
| 2 | 1976 | ALABAMA | PROHIBITION | 1111 |
| 3 | 1976 | ALABAMA | REPUBLICAN | 314970 |
| 4 | 1976 | ALASKA | DEMOCRAT | 34194 |
| ... | ... | ... | ... | ... |
| 4730 | 2020 | WISCONSIN | REPUBLICAN | 1661399 |
| 4731 | 2020 | WYOMING | CONSTITUTION | 7905 |
| 4732 | 2020 | WYOMING | DEMOCRAT | 66576 |
| 4733 | 2020 | WYOMING | LIBERTARIAN | 10154 |
| 4734 | 2020 | WYOMING | REPUBLICAN | 185732 |
4735 rows × 4 columns
# for every (year, state) keep only the party row with the highest vote total
winner_rows = grouped_data.groupby(["year", "state"])["candidatevotes"].idxmax()
grouped_data = grouped_data.loc[winner_rows]
grouped_data
| year | state | party | candidatevotes | |
|---|---|---|---|---|
| 0 | 1976 | ALABAMA | DEMOCRAT | 667052 |
| 5 | 1976 | ALASKA | REPUBLICAN | 83722 |
| 6 | 1976 | ARIZONA | DEMOCRAT | 355747 |
| 10 | 1976 | ARKANSAS | DEMOCRAT | 260998 |
| 13 | 1976 | CALIFORNIA | DEMOCRAT | 4144324 |
| ... | ... | ... | ... | ... |
| 4720 | 2020 | VIRGINIA | DEMOCRAT | 2253974 |
| 4723 | 2020 | WASHINGTON | DEMOCRAT | 2340356 |
| 4726 | 2020 | WEST VIRGINIA | REPUBLICAN | 514268 |
| 4730 | 2020 | WISCONSIN | REPUBLICAN | 1661399 |
| 4734 | 2020 | WYOMING | REPUBLICAN | 185732 |
1151 rows × 4 columns
# label each winning row: 'D' for DEMOCRAT, 'R' for REPUBLICAN;
# rows won by any other party keep a missing (NaN) state_class
grouped_data.loc[grouped_data['party'] == 'DEMOCRAT', 'state_class'] = 'D'
grouped_data.loc[grouped_data['party'] == 'REPUBLICAN', 'state_class'] = 'R'
# drop by keyword: the positional axis form drop('party', 1) is deprecated and
# no longer accepted by pandas 2.x
grouped_data = grouped_data.drop(columns='party')
# exclude DISTRICT OF COLUMBIA; reset_index() keeps the previous row labels
# in a new 'index' column
grouped_data = grouped_data[grouped_data['state'] != 'DISTRICT OF COLUMBIA'].reset_index()
grouped_data
| index | year | state | candidatevotes | state_class | |
|---|---|---|---|---|---|
| 0 | 0 | 1976 | ALABAMA | 667052 | D |
| 1 | 5 | 1976 | ALASKA | 83722 | R |
| 2 | 6 | 1976 | ARIZONA | 355747 | D |
| 3 | 10 | 1976 | ARKANSAS | 260998 | D |
| 4 | 13 | 1976 | CALIFORNIA | 4144324 | D |
| ... | ... | ... | ... | ... | ... |
| 1145 | 4720 | 2020 | VIRGINIA | 2253974 | D |
| 1146 | 4723 | 2020 | WASHINGTON | 2340356 | D |
| 1147 | 4726 | 2020 | WEST VIRGINIA | 514268 | R |
| 1148 | 4730 | 2020 | WISCONSIN | 1661399 | R |
| 1149 | 4734 | 2020 | WYOMING | 185732 | R |
1150 rows × 5 columns
# preview df, the table holding the turnout and demographic columns used below
df
| year | state | totalVotes | population | VAP | can_vote_percent | voter_turnout | female_percent | bachelors_percent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | 984154 | 3709468 | 2678112 | 0.721967 | 0.367481 | 0.514513 | 0.219164 |
| 1 | 1976 | ALASKA | 117916 | 522944 | 383875 | 0.734065 | 0.307173 | 0.479531 | 0.258126 |
| 2 | 1976 | ARIZONA | 729002 | 4279624 | 3497741 | 0.817301 | 0.208421 | 0.502997 | 0.254150 |
| 3 | 1976 | ARKANSAS | 336383 | 2229968 | 1618708 | 0.725888 | 0.207810 | 0.508982 | 0.197363 |
| 4 | 1976 | CALIFORNIA | 7442501 | 28253424 | 20528527 | 0.726585 | 0.362544 | 0.502785 | 0.285990 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1146 | 2020 | VIRGINIA | 4310779 | 8668704 | 6774559 | 0.781496 | 0.636319 | 0.509251 | 0.375549 |
| 1147 | 2020 | WASHINGTON | 3885792 | 7501000 | 5679975 | 0.757229 | 0.684121 | 0.501990 | 0.336999 |
| 1148 | 2020 | WEST VIRGINIA | 761385 | 1883907 | 1578350 | 0.837807 | 0.482393 | 0.507030 | 0.200656 |
| 1149 | 2020 | WISCONSIN | 3235981 | 5947403 | 4715247 | 0.792825 | 0.686280 | 0.503612 | 0.288813 |
| 1150 | 2020 | WYOMING | 270367 | 602588 | 471144 | 0.781868 | 0.573852 | 0.490027 | 0.264099 |
1150 rows × 9 columns
# attach the demographic / turnout features from df to each (year, state) winner row.
# The join is done on the (year, state) key instead of assigning column by column:
# plain assignment aligns on the row index, and the two frames' indexes do not refer
# to the same (year, state) rows (e.g. label 1146 is VIRGINIA 2020 in df but
# WASHINGTON 2020 in grouped_data), which attached features to the wrong state.
# NOTE(review): assumes df holds at most one row per (year, state) - confirm.
feature_cols = ['VAP', 'female_percent', 'bachelors_percent', 'can_vote_percent', 'voter_turnout']
grouped_data = grouped_data.drop(columns='index').merge(
    df[['year', 'state'] + feature_cols],
    on=['year', 'state'],
    how='left',  # keep every winner row; rows without a match get NaN features
)
# count the missing values per column
grouped_data.isnull().sum()
year 0 state 0 candidatevotes 0 state_class 17 VAP 1 female_percent 1 bachelors_percent 1 can_vote_percent 1 voter_turnout 1 dtype: int64
# discard every row that has a missing value in any column, then re-check the counts
grouped_data = grouped_data.dropna()
grouped_data.isnull().sum()
year 0 state 0 candidatevotes 0 state_class 0 VAP 0 female_percent 0 bachelors_percent 0 can_vote_percent 0 voter_turnout 0 dtype: int64
# hold out the 2010 and 2012 elections as test sets; every other year is training data
holdout_years = [2010, 2012]
train_data = grouped_data[~grouped_data['year'].isin(holdout_years)]
test_data_2010 = grouped_data[grouped_data['year'] == 2010]
test_data_2012 = grouped_data[grouped_data['year'] == 2012]

# split each subset into its feature matrix and its state_class label
feature_names = ['candidatevotes', 'VAP', 'can_vote_percent', 'female_percent', 'bachelors_percent', 'voter_turnout']
X_train, y_train = train_data[feature_names], train_data['state_class']
X_test_2010, y_test_2010 = test_data_2010[feature_names], test_data_2010['state_class']
X_test_2012, y_test_2012 = test_data_2012[feature_names], test_data_2012['state_class']
# normalize the training features: StandardScaler applied to all feature columns
mapper = DataFrameMapper([(X_train.columns, StandardScaler())])
# a scaler needs no target, so the stray positional y argument (4) is not passed
scaled_features = mapper.fit_transform(X_train.copy())
# rebuild a DataFrame so the scaled values keep the original row labels and column names
X_train_scaled = pd.DataFrame(scaled_features, index=X_train.index, columns=X_train.columns)
X_train_scaled
| candidatevotes | VAP | can_vote_percent | female_percent | bachelors_percent | voter_turnout | |
|---|---|---|---|---|---|---|
| 0 | -0.294685 | -0.330540 | -0.836712 | 1.061433 | -1.323680 | -0.573026 |
| 1 | -0.853522 | -0.822472 | -0.554075 | -3.628490 | -0.500820 | -1.035532 |
| 2 | -0.592919 | -0.154795 | 1.390405 | -0.482556 | -0.584780 | -1.792873 |
| 3 | -0.683690 | -0.557699 | -0.745093 | 0.319851 | -1.784116 | -1.797560 |
| 4 | 3.036587 | 3.496959 | -0.728810 | -0.510980 | 0.087663 | -0.610882 |
| ... | ... | ... | ... | ... | ... | ... |
| 1145 | 1.225607 | -0.789922 | 1.783793 | 0.128364 | 1.840198 | 1.681021 |
| 1146 | 1.308362 | 0.547823 | 0.553965 | 0.355885 | 1.979117 | 1.488724 |
| 1147 | -0.441054 | 0.313121 | -0.012943 | -0.617509 | 1.164949 | 1.855327 |
| 1148 | 0.657912 | -0.566352 | 1.869442 | 0.058220 | -1.714563 | 0.308250 |
| 1149 | -0.755796 | 0.106264 | 0.818607 | -0.400088 | 0.147285 | 1.871884 |
1032 rows × 6 columns
# normalize the 2010 test features with the statistics learned on the TRAINING set.
# Fitting a separate scaler on the test year would put the test features on a
# different scale than the one the models were trained on.
mapper = DataFrameMapper([(X_train.columns, StandardScaler())])
mapper.fit(X_train.copy())
scaled_features = mapper.transform(X_test_2010.copy())
# rebuild a DataFrame so the scaled values keep the original row labels and column names
X_test_scaled_2010 = pd.DataFrame(scaled_features, index=X_test_2010.index, columns=X_test_2010.columns)
X_test_scaled_2010
| candidatevotes | VAP | can_vote_percent | female_percent | bachelors_percent | voter_turnout | |
|---|---|---|---|---|---|---|
| 850 | -0.057883 | -0.202280 | 0.093931 | 1.071788 | -1.111750 | -0.384557 |
| 851 | -0.847113 | -0.813405 | -1.333973 | -3.644393 | -0.229386 | 1.318053 |
| 852 | -0.072764 | 0.016048 | -0.850284 | -0.480846 | -0.319417 | -0.634760 |
| 853 | -0.569423 | -0.484478 | -0.279216 | 0.326054 | -1.605482 | -0.718234 |
| 854 | 4.463940 | 4.552604 | -0.573652 | -0.509429 | 0.401653 | -0.811221 |
| 855 | -0.090360 | -0.171761 | -0.267245 | -1.033759 | 1.853397 | 1.006231 |
| 856 | -0.356354 | -0.376403 | 0.534492 | 0.929509 | 1.740566 | 0.228239 |
| 857 | -0.849079 | -0.780226 | 0.520692 | 1.226758 | 0.170619 | 0.666427 |
| 858 | 2.174174 | 1.978943 | 1.367431 | 0.643827 | -0.343395 | -0.800129 |
| 859 | 0.597473 | 0.491971 | -0.974612 | 0.700485 | -0.018173 | -0.841450 |
| 860 | -0.792602 | -0.709010 | 0.814662 | -1.044559 | 0.361828 | -0.874366 |
| 861 | -0.752803 | -0.692954 | -1.849147 | -1.037932 | -0.676336 | -0.079725 |
| 862 | 0.969282 | 0.982164 | -0.267903 | 0.406160 | 0.704469 | -0.260258 |
| 863 | 0.004296 | 0.037910 | -0.496258 | 0.204814 | -0.960616 | -0.604401 |
| 864 | -0.396435 | -0.462147 | -0.008971 | -0.220194 | -0.383694 | 1.135683 |
| 865 | -0.470415 | -0.499746 | -0.846458 | -0.353534 | 0.446968 | -0.076378 |
| 866 | -0.132716 | -0.267092 | 0.150907 | 0.186943 | -1.371520 | 0.159981 |
| 867 | -0.313169 | -0.247639 | -0.418695 | 0.518925 | -1.354754 | -1.449532 |
| 868 | -0.696785 | -0.709532 | 1.696724 | 0.568774 | -0.053367 | 2.108221 |
| 869 | 0.144599 | -0.051078 | 0.228653 | 1.310612 | 1.806197 | 0.224471 |
| 870 | 0.392009 | 0.087648 | 1.171195 | 1.336700 | 2.361296 | 0.533864 |
| 871 | 0.750784 | 0.559044 | 0.081973 | 0.401811 | -0.411531 | 0.395821 |
| 872 | 0.002235 | -0.129448 | -0.181737 | -0.376894 | 0.964732 | 1.866570 |
| 873 | -0.582070 | -0.483022 | -0.839214 | 1.044286 | -1.717595 | -0.634615 |
| 874 | 0.143781 | -0.023000 | 0.040072 | 0.498638 | -0.385264 | 0.348469 |
| 875 | -0.801929 | -0.765864 | 0.674831 | -1.155799 | 0.117107 | 1.112538 |
| 876 | -0.684152 | -0.648313 | -0.681882 | -0.399451 | 0.141012 | -0.878682 |
| 877 | -0.652774 | -0.517365 | -0.383429 | -1.520832 | -1.324044 | -0.808947 |
| 878 | -0.788506 | -0.714309 | 1.094005 | 0.005291 | 1.205971 | 0.597891 |
| 879 | 0.092533 | 0.399976 | 0.199741 | 0.902143 | 1.587099 | -1.264238 |
| 880 | -0.705744 | -0.614329 | -0.694898 | -0.096009 | -0.676256 | -0.162837 |
| 881 | 1.743053 | 2.028762 | 0.823136 | 1.287913 | 0.991206 | -1.532723 |
| 882 | 0.504323 | 0.503245 | -0.027319 | 0.841028 | -0.051434 | -0.474205 |
| 883 | -0.895789 | -0.813424 | 0.838498 | -1.609557 | -0.232170 | 0.831390 |
| 884 | 1.158040 | 0.806925 | 0.111580 | 0.711047 | -0.571025 | 0.557884 |
| 885 | -0.479570 | -0.363712 | -0.479058 | -0.213891 | -1.009617 | -1.791465 |
| 886 | -0.251251 | -0.335638 | 0.675213 | -0.218545 | 0.383797 | 1.276621 |
| 887 | 1.137825 | 1.022752 | 1.000752 | 0.818197 | -0.043841 | 0.019996 |
| 888 | -0.836085 | -0.753625 | 1.363217 | 1.366696 | 0.557775 | 0.099237 |
| 889 | -0.229292 | -0.222209 | 0.275915 | 0.935769 | -0.626429 | -0.308321 |
| 890 | -0.870266 | -0.796073 | -0.547641 | -0.926077 | -0.451779 | 1.908422 |
| 891 | -0.014491 | 0.032875 | 0.152473 | 0.808775 | -0.807778 | -1.170954 |
| 892 | 2.231398 | 2.660801 | -1.805596 | -0.334114 | -0.300526 | -2.121741 |
| 893 | -0.616893 | -0.545371 | -4.039526 | -1.192027 | 0.470705 | -0.913175 |
| 894 | -0.869942 | -0.818554 | 1.699453 | 0.133495 | 1.408815 | 1.258920 |
| 895 | 0.232211 | 0.285806 | 0.372648 | 0.362290 | 1.542232 | -0.648551 |
| 896 | 0.350109 | 0.090530 | 0.196400 | -0.616554 | 0.760306 | 1.288179 |
| 897 | -0.732101 | -0.628995 | 1.569293 | 0.062958 | -2.005173 | -0.719493 |
| 898 | 0.210493 | -0.065107 | 0.177924 | -0.397916 | -0.217057 | 1.439287 |
| 899 | -0.893804 | -0.831898 | -0.079098 | -2.229375 | -0.718339 | 0.582560 |
# normalize the 2012 test features with the statistics learned on the TRAINING set.
# Fitting a separate scaler on the test year would put the test features on a
# different scale than the one the models were trained on.
mapper = DataFrameMapper([(X_train.columns, StandardScaler())])
mapper.fit(X_train.copy())
scaled_features = mapper.transform(X_test_2012.copy())
# rebuild a DataFrame so the scaled values keep the original row labels and column names
X_test_scaled_2012 = pd.DataFrame(scaled_features, index=X_test_2012.index, columns=X_test_2012.columns)
X_test_scaled_2012
| candidatevotes | VAP | can_vote_percent | female_percent | bachelors_percent | voter_turnout | |
|---|---|---|---|---|---|---|
| 900 | -0.095371 | -0.208206 | 0.086671 | 1.071788 | -1.111750 | -0.167539 |
| 901 | -0.883483 | -0.808188 | -1.354648 | -3.644393 | -0.229386 | 0.070422 |
| 902 | -0.172023 | 0.024428 | -0.748984 | -0.480846 | -0.319417 | -1.321849 |
| 903 | -0.543457 | -0.484441 | -0.319992 | 0.326054 | -1.605482 | -1.011512 |
| 904 | 4.534902 | 4.553709 | -0.498963 | -0.509429 | 0.401653 | -1.573800 |
| 905 | -0.162902 | -0.159365 | -0.205943 | -1.033759 | 1.853397 | 1.257530 |
| 906 | -0.357912 | -0.378747 | 0.613089 | 0.929509 | 1.740566 | -0.133690 |
| 907 | -0.834890 | -0.774945 | 0.553491 | 1.226758 | 0.170619 | 0.173700 |
| 908 | 1.853919 | 1.998891 | 1.361108 | 0.643827 | -0.343395 | -0.614717 |
| 909 | 0.559034 | 0.498583 | -0.936838 | 0.700485 | -0.018173 | -0.781106 |
| 910 | -0.808521 | -0.703615 | 0.779738 | -1.044559 | 0.361828 | -2.090899 |
| 911 | -0.716950 | -0.688527 | -1.809730 | -1.037932 | -0.676336 | 0.160729 |
| 912 | 1.039876 | 0.953581 | -0.219800 | 0.406160 | 0.704469 | -0.254209 |
| 913 | -0.006559 | 0.028951 | -0.504759 | 0.204814 | -0.960616 | -0.239064 |
| 914 | -0.442120 | -0.463660 | -0.074947 | -0.220194 | -0.383694 | 1.738418 |
| 915 | -0.465730 | -0.499411 | -0.873593 | -0.353534 | 0.446968 | -0.635559 |
| 916 | -0.250269 | -0.270598 | 0.108922 | 0.186943 | -1.371520 | -0.248336 |
| 917 | -0.163480 | -0.247813 | -0.444731 | 0.518925 | -1.354754 | -0.635498 |
| 918 | -0.701159 | -0.708582 | 1.747623 | 0.568774 | -0.053367 | 1.745879 |
| 919 | 0.211435 | -0.048376 | 0.212770 | 1.310612 | 1.806197 | 0.513819 |
| 920 | 0.541364 | 0.087950 | 1.214127 | 1.336700 | 2.361296 | 0.216369 |
| 921 | 0.727348 | 0.533581 | 0.145181 | 0.401811 | -0.411531 | 0.995818 |
| 922 | -0.112823 | -0.131954 | -0.221095 | -0.376894 | 0.964732 | 2.203621 |
| 923 | -0.493806 | -0.484812 | -0.808556 | 1.044286 | -1.717595 | 0.093573 |
| 924 | 0.077510 | -0.033107 | 0.029139 | 0.498638 | -0.385264 | 0.672533 |
| 925 | -0.830729 | -0.761913 | 0.622852 | -1.155799 | 0.117107 | 1.180032 |
| 926 | -0.649694 | -0.645483 | -0.779001 | -0.399451 | 0.141012 | 0.308400 |
| 927 | -0.679042 | -0.513083 | -0.343469 | -1.520832 | -1.324044 | -0.973282 |
| 928 | -0.766484 | -0.712160 | 1.223978 | 0.005291 | 1.205971 | 1.753255 |
| 929 | 0.326135 | 0.389157 | 0.216150 | 0.902143 | 1.587099 | -0.770980 |
| 930 | -0.705391 | -0.611802 | -0.640352 | -0.096009 | -0.676256 | -0.662049 |
| 931 | 1.907619 | 2.004142 | 0.830355 | 1.287913 | 0.991206 | -1.623979 |
| 932 | 0.644932 | 0.509068 | -0.006861 | 0.841028 | -0.051434 | 0.768687 |
| 933 | -0.892287 | -0.806602 | 0.604220 | -1.609557 | -0.232170 | 0.664431 |
| 934 | 0.947054 | 0.774741 | 0.113761 | 0.711047 | -0.571025 | 0.671875 |
| 935 | -0.378605 | -0.362831 | -0.568937 | -0.213891 | -1.009617 | -1.054852 |
| 936 | -0.308849 | -0.333699 | 0.705271 | -0.218545 | 0.383797 | 0.418719 |
| 937 | 1.077341 | 0.993105 | 0.988545 | 0.818197 | -0.043841 | 0.310871 |
| 938 | -0.847861 | -0.751674 | 1.405336 | 1.366696 | 0.557775 | -0.303365 |
| 939 | -0.251361 | -0.218652 | 0.286814 | 0.935769 | -0.626429 | -0.592295 |
| 940 | -0.866685 | -0.791014 | -0.660075 | -0.926077 | -0.451779 | 0.622928 |
| 941 | 0.006825 | 0.031590 | 0.135420 | 0.808775 | -0.807778 | -1.049667 |
| 942 | 2.307052 | 2.714570 | -1.810784 | -0.334114 | -0.300526 | -1.901682 |
| 943 | -0.535727 | -0.535938 | -3.999367 | -1.192027 | 0.470705 | -0.376942 |
| 944 | -0.865963 | -0.815162 | 1.772881 | 0.133495 | 1.408815 | 0.665523 |
| 945 | 0.388127 | 0.291530 | 0.345638 | 0.362290 | 1.542232 | 0.826313 |
| 946 | 0.207673 | 0.097366 | 0.195844 | -0.616554 | 0.760306 | 0.487900 |
| 947 | -0.733911 | -0.630207 | 1.498690 | 0.062958 | -2.005173 | -1.416878 |
| 948 | 0.063549 | -0.073764 | 0.191754 | -0.397916 | -0.217057 | 1.692852 |
| 949 | -0.897649 | -0.826610 | -0.157944 | -2.229375 | -0.718339 | 0.219551 |
# random forest classifier tuned with an exhaustive grid search (10-fold CV)
parameters_grid = dict(
    max_depth=list(range(3, 11)),
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[2, 4, 6],
    n_estimators=[20, 50, 80, 100],
)
rf = RandomForestClassifier(random_state=RSEED)
grid_search = GridSearchCV(cv=10, param_grid=parameters_grid, estimator=rf)
# run the search on the scaled training features
grid_search.fit(X_train_scaled, y_train)

# estimator with the best cross-validated parameter combination
best = grid_search.best_estimator_

# show the selected hyper-parameters as a one-row table
pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T
| max_depth | min_samples_leaf | min_samples_split | n_estimators | |
|---|---|---|---|---|
| Selected Value | 10 | 2 | 2 | 100 |
# score the tuned model on the held-out 2010 election
y_pred_2010 = best.predict(X_test_scaled_2010)
acc = round(accuracy_score(y_test_2010, y_pred_2010), 3)
# one-cell summary table: model name as the row label, accuracy as the column
rf_df = pd.DataFrame({'Accuracy': [acc]}, index=['Random Forest Classifier'])
rf_df
| Accuracy | |
|---|---|
| Random Forest Classifier | 0.82 |
# plot the confusion matrix of the tuned model (`best`) on the 2010 test set
plot_confusion_matrix(best, X_test_scaled_2010, y_test_2010, cmap='Blues')
# title is set through pyplot, i.e. on the current axes
plt.title("Random Forest Confusion Matrix")
plt.show()
# score the tuned model on the held-out 2012 election
y_pred_2012 = best.predict(X_test_scaled_2012)
acc = round(accuracy_score(y_test_2012, y_pred_2012), 3)
# one-cell summary table: model name as the row label, accuracy as the column
rf_df = pd.DataFrame({'Accuracy': [acc]}, index=['Random Forest Classifier'])
rf_df
| Accuracy | |
|---|---|
| Random Forest Classifier | 0.74 |
# plot the confusion matrix of the tuned model (`best`) on the 2012 test set
plot_confusion_matrix(best, X_test_scaled_2012, y_test_2012, cmap='Blues')
# title is set through pyplot, i.e. on the current axes
plt.title("Random Forest Confusion Matrix")
plt.show()
# feature importance of the tuned random forest.
# The importances are read from the estimator GridSearchCV already fitted: calling
# .fit(X_train, y_train) here would retrain the tuned model in place on the UNSCALED
# features, replacing the model that was selected and evaluated on the scaled ones.
imp = pd.DataFrame(grid_search.best_estimator_.feature_importances_,
                   index=X_train.columns, columns=['Importance']).sort_values('Importance')
# keep the six most important features, largest first
imp = imp.nlargest(6, 'Importance')
imp
| Importance | |
|---|---|
| bachelors_percent | 0.226505 |
| female_percent | 0.225420 |
| VAP | 0.167972 |
| can_vote_percent | 0.137320 |
| candidatevotes | 0.127624 |
| voter_turnout | 0.115160 |
# horizontal bar chart of the ranked feature importances
bar_positions = range(len(imp))
plt.figure(figsize=(8, 6))
plt.barh(bar_positions, imp['Importance'], color='#9ecae1')
# label every bar with its feature name
plt.yticks(bar_positions, imp.index)
plt.grid(axis='x', alpha=0.5, color='lightgrey')
plt.title('Feature Importance', fontsize=14)
plt.show()
# gradient boosting classifier tuned with an exhaustive grid search (10-fold CV)
parameters_grid = dict(
    learning_rate=[0.5, 1, 1.5, 2],
    n_estimators=[50, 100, 200],
)
gb = GradientBoostingClassifier(random_state=RSEED)
grid_search = GridSearchCV(cv=10, param_grid=parameters_grid, estimator=gb)
# run the search on the scaled training features
grid_search.fit(X_train_scaled, y_train)

# estimator with the best cross-validated parameter combination
best = grid_search.best_estimator_

# show the selected hyper-parameters as a one-row table
pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T
| learning_rate | n_estimators | |
|---|---|---|
| Selected Value | 0.5 | 200.0 |
# predict the 2010 test set with the tuned gradient boosting model
y_pred_2010 = best.predict(X_test_scaled_2010)
# calculate metrics
acc = round(accuracy_score(y_test_2010, y_pred_2010), 3)
# display metrics; the row is labelled with the model actually evaluated here
# (gradient boosting, not random forest). The rf_df name is kept for compatibility.
rf_df = pd.DataFrame([acc]).T
rf_df = rf_df.rename(index={0: 'Gradient Boosting Classifier'}, columns={0: 'Accuracy'})
rf_df
| Accuracy | |
|---|---|
| Random Forest Classifier | 0.82 |
# plot the confusion matrix of the tuned gradient boosting model on the 2010 test set,
# matching the 2010 predictions and accuracy of this section
plot_confusion_matrix(best, X_test_scaled_2010, y_test_2010, cmap='Blues')
plt.title("Gradient Boosting Confusion Matrix")
plt.show()
# predict the 2012 test set with the tuned gradient boosting model
y_pred_2012 = best.predict(X_test_scaled_2012)
# calculate metrics
acc = round(accuracy_score(y_test_2012, y_pred_2012), 3)
# display metrics; the row is labelled with the model actually evaluated here
# (gradient boosting, not random forest). The rf_df name is kept for compatibility.
rf_df = pd.DataFrame([acc]).T
rf_df = rf_df.rename(index={0: 'Gradient Boosting Classifier'}, columns={0: 'Accuracy'})
rf_df
| Accuracy | |
|---|---|
| Random Forest Classifier | 0.68 |
# plot the confusion matrix of the tuned gradient boosting model on the 2012 test set
plot_confusion_matrix(best, X_test_scaled_2012, y_test_2012, cmap='Blues')
# title names the model actually plotted (gradient boosting)
plt.title("Gradient Boosting Confusion Matrix")
plt.show()
# feature importance of the tuned gradient boosting model.
# The importances are read from the estimator GridSearchCV already fitted: calling
# .fit(X_train, y_train) here would retrain the tuned model in place on the UNSCALED
# features, replacing the model that was selected and evaluated on the scaled ones.
imp = pd.DataFrame(grid_search.best_estimator_.feature_importances_,
                   index=X_train.columns, columns=['Importance']).sort_values('Importance')
# keep the six most important features, largest first
imp = imp.nlargest(6, 'Importance')
imp
| Importance | |
|---|---|
| bachelors_percent | 0.242982 |
| female_percent | 0.222382 |
| VAP | 0.178177 |
| can_vote_percent | 0.143347 |
| candidatevotes | 0.126458 |
| voter_turnout | 0.086654 |
# horizontal bar chart of the ranked feature importances
bar_positions = range(len(imp))
plt.figure(figsize=(8, 6))
plt.barh(bar_positions, imp['Importance'], color='#b3cde3')
# label every bar with its feature name
plt.yticks(bar_positions, imp.index)
plt.grid(axis='x', alpha=0.5, color='lightgrey')
plt.title('Feature Importance', fontsize=14)
plt.show()
# SVC hyper-parameter search over C, gamma and kernel, with a verbose per-fit log
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}
svc_model = SVC()
grid = GridSearchCV(estimator=svc_model, param_grid=param_grid, verbose=2, refit=True)
# fit on the scaled training features
grid.fit(X_train_scaled, y_train)
Fitting 5 folds for each of 48 candidates, totalling 240 fits [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, 
gamma=0.1, kernel=poly; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; 
total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.1s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] 
END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END 
....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END 
.........................C=10, gamma=1, kernel=poly; total time= 2.3s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 3.6s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 3.3s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 2.9s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 2.5s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END 
.......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ..................C=10, 
gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 24.0s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 37.4s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 1.2min [CV] END ........................C=100, gamma=1, kernel=poly; total time= 35.6s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 30.2s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.1, 
kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 
0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s
GridSearchCV(estimator=SVC(),
param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
'kernel': ['rbf', 'poly', 'sigmoid']},
verbose=2)
# Show the hyper-parameter combination that won the SVC grid search.
svc_grid_winner = grid.best_estimator_
print(svc_grid_winner)
SVC(C=10, gamma=1)
# Evaluate the tuned SVC on the 2010 hold-out set: predict once, then report
# the confusion matrix followed by the per-class precision/recall/F1 table.
grid_predictions = grid.predict(X_test_scaled_2010)
confusion_2010 = confusion_matrix(y_test_2010, grid_predictions)
report_2010 = classification_report(y_test_2010, grid_predictions)
print(confusion_2010)
print(report_2010)
[[12 2]
[12 24]]
precision recall f1-score support
D 0.50 0.86 0.63 14
R 0.92 0.67 0.77 36
accuracy 0.72 50
macro avg 0.71 0.76 0.70 50
weighted avg 0.80 0.72 0.73 50
from sklearn.inspection import permutation_importance
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
grid.fit(X_train_scaled, y_train)
perm_importance = permutation_importance(grid, X_test_scaled_2010, y_test_2010)
feature_names = X_train_scaled.columns
features = np.array(feature_names)
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(features[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
Fitting 5 folds for each of 48 candidates, totalling 240 fits [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, 
gamma=0.1, kernel=poly; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; 
total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.3s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.1s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] 
END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END 
....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END 
.........................C=10, gamma=1, kernel=poly; total time= 2.3s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 3.7s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 3.4s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 3.0s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 2.6s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END 
.......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ..................C=10, 
gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 24.8s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 37.9s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 1.1min [CV] END ........................C=100, gamma=1, kernel=poly; total time= 36.7s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 30.6s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.1, 
kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 
0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s
Text(0.5, 0, 'Permutation Importance')
# Evaluate the tuned SVC on the 2012 hold-out set: predict once, then report
# the confusion matrix followed by the per-class precision/recall/F1 table.
grid_predictions = grid.predict(X_test_scaled_2012)
confusion_2012 = confusion_matrix(y_test_2012, grid_predictions)
report_2012 = classification_report(y_test_2012, grid_predictions)
print(confusion_2012)
print(report_2012)
[[14 7]
[10 19]]
precision recall f1-score support
D 0.58 0.67 0.62 21
R 0.73 0.66 0.69 29
accuracy 0.66 50
macro avg 0.66 0.66 0.66 50
weighted avg 0.67 0.66 0.66 50
from sklearn.inspection import permutation_importance
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
grid.fit(X_train_scaled, y_train)
perm_importance = permutation_importance(grid, X_test_scaled_2012, y_test_2012)
feature_names = X_train_scaled.columns
features = np.array(feature_names)
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(features[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
Fitting 5 folds for each of 48 candidates, totalling 240 fits [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=0.1, 
gamma=0.1, kernel=poly; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=0.1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=0.1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=0.1, gamma=0.001, kernel=rbf; 
total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.3s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.4s [CV] END ..........................C=1, gamma=1, kernel=poly; total time= 0.1s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=1, kernel=sigmoid; total time= 0.0s [CV] 
END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .........................C=1, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ........................C=1, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ........................C=1, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .......................C=1, gamma=0.01, kernel=poly; total time= 0.0s [CV] END 
....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .......................C=1, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ......................C=1, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END ..........................C=10, gamma=1, kernel=rbf; total time= 0.0s [CV] END 
.........................C=10, gamma=1, kernel=poly; total time= 2.4s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 3.7s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 3.3s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 2.9s [CV] END .........................C=10, gamma=1, kernel=poly; total time= 2.6s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ........................C=10, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END 
.......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ......................C=10, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .....................C=10, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ..................C=10, 
gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END ..................C=10, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END .........................C=100, gamma=1, kernel=rbf; total time= 0.0s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 25.3s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 37.9s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 1.1min [CV] END ........................C=100, gamma=1, kernel=poly; total time= 35.0s [CV] END ........................C=100, gamma=1, kernel=poly; total time= 29.8s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=1, kernel=sigmoid; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END .......................C=100, gamma=0.1, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.1, 
kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ......................C=100, gamma=0.1, kernel=poly; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END .....................C=100, gamma=0.01, kernel=poly; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END ..................C=100, gamma=0.01, kernel=sigmoid; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 
0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END ....................C=100, gamma=0.001, kernel=poly; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s [CV] END .................C=100, gamma=0.001, kernel=sigmoid; total time= 0.0s
Text(0.5, 0, 'Permutation Importance')
| Model | Accuracy |
|---|---|
| Random Forest | 0.82 |
| Gradient Boosting | 0.82 |
| SVM | 0.72 |
As far as performance goes, the Random Forest and Gradient Boosting classifiers both take the cake, as they share the highest accuracy (0.82), while the SVM model performed surprisingly poorly (0.72).
| Model | Accuracy |
|---|---|
| Random Forest | 0.74 |
| Gradient Boosting | 0.68 |
| SVM | 0.66 |
As for the year 2012, the overall performance has dropped across all the models. This time the Random Forest classifier has the highest accuracy (0.74), while SVM has the lowest (0.66). It is also important to note that SVM and Gradient Boosting perform similarly, since their accuracies (0.66 and 0.68) are close to each other.